Associations of features for 3979 uniprot IDs or shared 8861 Ensembl keyIDs -

multimapping, mismap, standard deviation of versions, distance metrics (laser plots)

  • purpose to better understand mismap scores and things influencing mismapping

originally made 9.18.19, updated on 10.22.19, updated on 12.11.19 # scatter plot ukb level mismap scorev1.0 with multimapping ENSP column counts

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import csv
from ast import literal_eval
from statistics import mean
# plotting
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
In [2]:
sys.path.append("/Users/mariapalafox/Desktop/Toolbox/")
from all_funx import *
from IPython.display import display, HTML
pd.set_option('display.max_columns', None)
pd.options.display.max_seq_items = 2000
display(HTML("<style>.container {width:90% !important;}</style>"))
In [3]:
os.chdir("/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/TSV_UNIPROT_xref/MULTIMAPPING_IDS")
print(os.listdir())
['ultimate_UKB_sequence_key_3953.csv', 'MISMAP2.0', 'GENE_ID_UKB.csv', 'GENE_ID_UKB.xlsx', '.DS_Store', 'ALL_releases_list_stdev_ID_versions_3953.csv', 'ENSEMBL_SEQ_v97_3953_UKBIDs.csv', 'ENSEMBL_SEQ_v96_3953_UKBIDs.csv', 'ALL_releases_list_stdev_ID_versions_3887.csv', 'ALL_releases_set_counts_ID_types_3953 copy.csv', 'SEQ_distance_UKBvsENSP', 'dict_3953_C_value_counts.csv', 'G6PD_pulledFrom_3953groupby_set_IDcount.csv', 'UKBID_3953set.csv', 'ALL_releases_set_counts_ID_types_3887.csv', 'dict_3953_CK_value_counts.csv', 'xref_merge_ENSEMBL_FASTA', 'uniprot-filtered-organism__Homo+sapiens+(Human)+[9606]_+AND+review--.tab', 'ENSEMBL_SEQ_v94_3953_UKBIDs.csv', 'GENE_ID_KEY_NOEXCEL_OPENING.csv', 'ALL_releases_list_stdev_ID_versions_3887 copy.csv', 'dict_3953_K_value_counts.csv', 'ENSEMBL_SEQ_v85_3953_UKBIDs.csv', 'dict_with_CK_counts_3953.csv', 'ALL_releases_set_counts_ID_types_3953.csv', 'ENSEMBL_SEQ_v92_3953_UKBIDs.csv', 'UKBID_3887set.csv']

[previously needed code] checking for differences in ENSP ID counts between the releases

previously did this when i had total ENSP per ID instead of what i have now, unique # ENSP

# np.where can compare multiple columm rows values
# cleanest way is to check all columns against the first column

dfcount = df[['v97_ENSP_count', 'v85_ENSP_count',
       'v92_ENSP_count', 'v94_ENSP_count', 'v96_ENSP_count']].copy()
dfcount.head(3)

v97_ENSP_count  v85_ENSP_count  v92_ENSP_count  v94_ENSP_count  v96_ENSP_count
0   2   2   2   2   2
1   1   1   1   1   1
2   2   3   2   2   2


dfcount.eq(dfcount.iloc[:,0],axis=0)

# Now you can use all (if they are all equal to the first item, they are all equal)
df['sameCountENSP'] = dfcount.eq(dfcount.iloc[:, 0], axis=0).all(axis=1)

prev. results

  • for uniprot IDs with consistent number of ENSP IDs linked, the avg frac_missed score is 0.15

  • for uniprot IDs that DO NOT have consistent number of ENSP IDs linked, the avg frac_missed score is 0.29

# dfc is counts version of df
def summarizeMultimapping(dfc,df):
    mea = dfc.mean(axis=1)
    st = dfc.std(axis=1)
    su = dfc.sum(axis=1)
    # ID and gene name from df version of data
    xref = df.xref
    gene = df.geneNamePrimary
    dsum = pd.concat([xref, gene, su, mea, st],axis=1)
    dsum.columns = ['xref','geneNamePrimary','totalENSPlinked','meanNumENSPlinked','stdENSPlinked']
    dfinal = dsum.sort_values('totalENSPlinked')
    return dfinal


dfallcounts = df[['v97_ENSP_count', 'v85_ENSP_count',
       'v92_ENSP_count', 'v94_ENSP_count', 'v96_ENSP_count']].copy()

dall = summarizeMultimapping(dfallcounts, df)

# mapping these columns to df
ref_tot = dict(zip(dall.xref, dall.totalENSPlinked))
ref_mean = dict(zip(dall.xref, dall.meanNumENSPlinked))
ref_std = dict(zip(dall.xref, dall.stdENSPlinked))

# adding gene name column 
df['total_ENSP_linked'] = df['xref']
df['mean_ENSP_linked'] = df['xref']
df['std_ENSP_linked'] = df['xref']
df.total_ENSP_linked = df.total_ENSP_linked.map(ref_tot)
df.mean_ENSP_linked = df.mean_ENSP_linked.map(ref_mean)
df.std_ENSP_linked = df.std_ENSP_linked.map(ref_std)



MAKING GENE KEY

In [26]:
# improting uniprot gene name key to start adding genes names iwth IDs
genes = pd.read_csv("GENE_ID_UKB.csv")
genes.head(5)
Out[26]:
ID gene_names cross_ref_HGNC
0 Q96IY4 CPB2 1361;
1 P22362 CCL1 SCYA1 6346;
2 Q8NCR9 CLRN3 TMEM12 USH3AL1 119467;
3 Q8IUK8 CBLN2 UNQ1892/PRO4338 147381;
4 Q9BX69 CARD6 84674;
In [27]:
genes.dropna(inplace = True)
In [28]:
# split gene col create new col 
splitdf = genes["gene_names"].str.split(" ", n=1, expand=True)
splitdf_final = splitdf[0]
splitdf_final.columns = ['HGNC_gene']
genes = pd.concat([genes, splitdf_final], axis=1)
In [29]:
genes.columns = ['ID', 'GeneName_primary', 'HGNC_ID', 'HGNC_name']
genes.columns
Out[29]:
Index(['ID', 'GeneName_primary', 'HGNC_ID', 'HGNC_name'], dtype='object')
In [30]:
#genes.to_csv("GENE_ID_KEY_NOEXCEL_OPENING.csv", index=False)
In [31]:
ref_gene = dict(zip(genes.ID, genes.HGNC_name))



Importing files with set and list columns for n=3953

In [15]:
allset = pd.read_csv("ALL_releases_set_counts_ID_types_3953.csv")
alllist = pd.read_csv("ALL_releases_list_stdev_ID_versions_3953.csv")
ultimate = pd.read_csv("ultimate_UKB_sequence_key_3953.csv")
mismap = pd.read_csv("MISMAP2.0/R_dynamic_slope_scores_3953.csv")
In [16]:
allset.columns = ['ID', 'ENSP', 'ENSPv', 'ENST', 'ENSTv', 'ENSG', 'ENSGv',
       'stableID_key', 'proSequence', 'count_ENSP', 'count_ENSPv',
       'count_ENST', 'count_ENSTv', 'count_ENSG', 'count_ENSGv',
       'count_stableID_key', 'count_proSequence']
allset.set_index('ID', inplace=True)
allset.head(3)
Out[16]:
ENSP ENSPv ENST ENSTv ENSG ENSGv stableID_key proSequence count_ENSP count_ENSPv count_ENST count_ENSTv count_ENSG count_ENSGv count_stableID_key count_proSequence
ID
A0AVT1 {'ENSP00000313454', 'ENSP00000399234'} {'ENSP00000313454.4', 'ENSP00000399234.2'} {'ENST00000322244', 'ENST00000420827'} {'ENST00000420827.2', 'ENST00000322244.5', 'EN... {'ENSG00000033178'} {'ENSG00000033178.13', 'ENSG00000033178.8', 'E... {'ENSG00000033178_ENST00000420827_ENSP00000399... {'MEGSEPVAAHQGEEASCSSWGTGSTNKNLPIMSTASVEIDDALY... 2 2 2 4 1 3 2 2
A0FGR8 {'ENSP00000251527'} {'ENSP00000251527.5', 'ENSP00000251527.6'} {'ENST00000251527'} {'ENST00000251527.10', 'ENST00000251527.5', 'E... {'ENSG00000117868'} {'ENSG00000117868.16', 'ENSG00000117868.11', '... {'ENSG00000117868_ENST00000251527_ENSP00000251... {'MTPPSRAEAGVRRSRVPSEGRWRGAEPPGISASTQPASAGRAAR... 1 2 1 3 1 3 1 2
A0JNW5 {'ENSP00000349285', 'ENSP00000444824', 'ENSP00... {'ENSP00000444824.2', 'ENSP00000349285.3', 'EN... {'ENST00000545232', 'ENST00000356828', 'ENST00... {'ENST00000545232.2', 'ENST00000279907.11', 'E... {'ENSG00000111647'} {'ENSG00000111647.13', 'ENSG00000111647.12', '... {'ENSG00000111647_ENST00000279907_ENSP00000279... {'MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEE... 3 3 3 6 1 3 3 3
In [17]:
alllist.columns = ['ID', 'Length', 'pro_ver', 'tx_ver', 'gen_ver', 'stdev_length',
       'stdev_prov', 'stdev_txv', 'stdev_genv']
alllist.set_index('ID', inplace=True)
alllist.head(3)
Out[17]:
Length pro_ver tx_ver gen_ver stdev_length stdev_prov stdev_txv stdev_genv
ID
A0AVT1 [1052, 389, 1052, 389, 1052, 389, 1052, 389, 1... [4, 2, 4, 2, 4, 2, 4, 2, 4, 2] [5, 2, 9, 2, 9, 2, 10, 2, 10, 2] [8, 8, 12, 12, 12, 12, 13, 13, 13, 13] 349.431681 1.054093 3.743142 1.955050
A0FGR8 [893, 893, 893, 845, 845] [5, 5, 5, 6, 6] [5, 9, 9, 10, 10] [11, 15, 15, 16, 16] 26.290683 0.547723 2.073644 2.073644
A0JNW5 [1464, 522, 1114, 1464, 522, 1464, 522, 1464, ... [7, 3, 2, 7, 3, 7, 3, 7, 3, 7, 3] [7, 3, 2, 11, 7, 11, 7, 12, 7, 12, 7] [8, 8, 8, 12, 12, 12, 12, 13, 13, 13, 13] 472.410838 2.195036 3.400535 2.148996
In [18]:
ultimate = ultimate[['ID', 'labeled_pos_count']].copy()
ultimate.set_index('ID', inplace=True)
ultimate.head(3)
Out[18]:
labeled_pos_count
ID
Q9HAS0 1
Q86X76 3
Q9NQR4 7
In [19]:
mismap.set_index('ID',inplace=True)
mismap.head(3)
Out[19]:
frac_missed dynamic_slope_scores
ID
A0AVT1 [0.4090909090909091, 0.4090909090909091, 0.409... 1
A0FGR8 [1.0, 1.0, 1.0, 1.0, 1.0] 1
A0JNW5 [0.6666666666666666, 0.5, 0.5, 0.5, 0.5] 2
In [20]:
mer = pd.concat([allset, alllist, ultimate, mismap], axis=1)
mer.head(3)
/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning:

Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.


Out[20]:
ENSP ENSPv ENST ENSTv ENSG ENSGv stableID_key proSequence count_ENSP count_ENSPv count_ENST count_ENSTv count_ENSG count_ENSGv count_stableID_key count_proSequence Length pro_ver tx_ver gen_ver stdev_length stdev_prov stdev_txv stdev_genv labeled_pos_count frac_missed dynamic_slope_scores
A0AVT1 {'ENSP00000313454', 'ENSP00000399234'} {'ENSP00000313454.4', 'ENSP00000399234.2'} {'ENST00000322244', 'ENST00000420827'} {'ENST00000420827.2', 'ENST00000322244.5', 'EN... {'ENSG00000033178'} {'ENSG00000033178.13', 'ENSG00000033178.8', 'E... {'ENSG00000033178_ENST00000420827_ENSP00000399... {'MEGSEPVAAHQGEEASCSSWGTGSTNKNLPIMSTASVEIDDALY... 2 2 2 4 1 3 2 2 [1052, 389, 1052, 389, 1052, 389, 1052, 389, 1... [4, 2, 4, 2, 4, 2, 4, 2, 4, 2] [5, 2, 9, 2, 9, 2, 10, 2, 10, 2] [8, 8, 12, 12, 12, 12, 13, 13, 13, 13] 349.431681 1.054093 3.743142 1.955050 11 [0.4090909090909091, 0.4090909090909091, 0.409... 1
A0FGR8 {'ENSP00000251527'} {'ENSP00000251527.5', 'ENSP00000251527.6'} {'ENST00000251527'} {'ENST00000251527.10', 'ENST00000251527.5', 'E... {'ENSG00000117868'} {'ENSG00000117868.16', 'ENSG00000117868.11', '... {'ENSG00000117868_ENST00000251527_ENSP00000251... {'MTPPSRAEAGVRRSRVPSEGRWRGAEPPGISASTQPASAGRAAR... 1 2 1 3 1 3 1 2 [893, 893, 893, 845, 845] [5, 5, 5, 6, 6] [5, 9, 9, 10, 10] [11, 15, 15, 16, 16] 26.290683 0.547723 2.073644 2.073644 1 [1.0, 1.0, 1.0, 1.0, 1.0] 1
A0JNW5 {'ENSP00000349285', 'ENSP00000444824', 'ENSP00... {'ENSP00000444824.2', 'ENSP00000349285.3', 'EN... {'ENST00000545232', 'ENST00000356828', 'ENST00... {'ENST00000545232.2', 'ENST00000279907.11', 'E... {'ENSG00000111647'} {'ENSG00000111647.13', 'ENSG00000111647.12', '... {'ENSG00000111647_ENST00000279907_ENSP00000279... {'MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEE... 3 3 3 6 1 3 3 3 [1464, 522, 1114, 1464, 522, 1464, 522, 1464, ... [7, 3, 2, 7, 3, 7, 3, 7, 3, 7, 3] [7, 3, 2, 11, 7, 11, 7, 12, 7, 12, 7] [8, 8, 8, 12, 12, 12, 12, 13, 13, 13, 13] 472.410838 2.195036 3.400535 2.148996 1 [0.6666666666666666, 0.5, 0.5, 0.5, 0.5] 2


















importing ENSP-specific files from each release

with distance from UKB canonical sequence for each ENSP

2 versions:

  1. all ensp linked to ALL 3953 ukb IDs per release (diff row count)
  2. all ensp SHARING 8861 ENSG_ENST_ENSP keys in all releases (same row count)
  3. all ensp FALSE identity to canonical seq. with 1796 ukb IDs (diff row count)
In [32]:
os.chdir("/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/TSV_UNIPROT_xref/MULTIMAPPING_IDS/MISMAP2.0/")
In [33]:
v97s = pd.read_csv("ENSEMBL_MISMAP_SCORED_v97_8861shared_stableIDkeys.csv")
v97a = pd.read_csv("ENSEMBL_MISMAP_SCORED_v97_3953UKB.csv")
v97f = pd.read_csv("ENSEMBL_MISMAP_SCORED_v97_1796UKBIDs.csv")

v96s = pd.read_csv("ENSEMBL_MISMAP_SCORED_v96_8861shared_stableIDkeys.csv")
v96a = pd.read_csv("ENSEMBL_MISMAP_SCORED_v96_3953UKB.csv")
v96f = pd.read_csv("ENSEMBL_MISMAP_SCORED_v96_1796UKBIDs.csv")

v94s = pd.read_csv("ENSEMBL_MISMAP_SCORED_v94_8861shared_stableIDkeys.csv")
v94a = pd.read_csv("ENSEMBL_MISMAP_SCORED_v94_3953UKB.csv")
v94f = pd.read_csv("ENSEMBL_MISMAP_SCORED_v94_1796UKBIDs.csv")

v92s = pd.read_csv("ENSEMBL_MISMAP_SCORED_v92_8861shared_stableIDkeys.csv")
v92a = pd.read_csv("ENSEMBL_MISMAP_SCORED_v92_3953UKB.csv")
v92f = pd.read_csv("ENSEMBL_MISMAP_SCORED_v92_1796UKBIDs.csv")

v85s = pd.read_csv("ENSEMBL_MISMAP_SCORED_v85_8861shared_stableIDkeys.csv")
v85a = pd.read_csv("ENSEMBL_MISMAP_SCORED_v85_3953UKB.csv")
v85f = pd.read_csv("ENSEMBL_MISMAP_SCORED_v85_1796UKBIDs.csv")

^these files were used to create boxplots for FRACTION MISSED SCORES in panel 2, before i normalized by ENSP multimapping counts. These mismap scores are in MAPPING.xlsx as general level mismap...probabilities can be mentioned in paper

In [34]:
dfs = [v85a, v92a, v94a, v96a, v97a]
for i in dfs: 
    print(i.shape)
(10183, 34)
(10395, 34)
(10612, 34)
(10663, 34)
(10564, 34)
In [35]:
dfs = [v85s, v92s, v94s, v96s, v97s]
for i in dfs: 
    print(i.shape)
(8861, 34)
(8861, 34)
(8861, 34)
(8861, 34)
(8861, 34)
In [36]:
dff = [v85f, v92f, v94f, v96f, v97f]
for i in dff: 
    print(i.shape)
(4189, 35)
(4074, 35)
(4148, 35)
(4166, 35)
(4143, 35)
In [37]:
print(v85a.columns)
print()
print(v85s.columns)
print()
print(v85f.columns)
Index(['ENSPv', 'ENSP', 'ENSTv', 'ENST', 'ENSGv', 'ENSG', 'Length',
       'proSequence', 'stableID_key', 'ID', 'pro_ver', 'tx_ver', 'gen_ver',
       'identical_2UKBseq', 'Length_UKB', 'len_ensp_minus_ukb',
       'hamming_distance', 'hamming_normalized_dist', 'levenshtein_distance',
       'levenshtein_normalized_dist', 'key_8861', 'entryName',
       'labeled_pos_count', 'pos_dict', 'count_C_targets', 'count_K_targets',
       'found_count', 'found_count_C', 'found_count_K', 'missed_count',
       'missed_count_C', 'missed_count_K', 'correct_frac', 'missed_frac'],
      dtype='object')

Index(['ENSPv', 'ENSP', 'ENSTv', 'ENST', 'ENSGv', 'ENSG', 'Length',
       'proSequence', 'stableID_key', 'ID', 'pro_ver', 'tx_ver', 'gen_ver',
       'identical_2UKBseq', 'Length_UKB', 'len_ensp_minus_ukb',
       'hamming_distance', 'hamming_normalized_dist', 'levenshtein_distance',
       'levenshtein_normalized_dist', 'key_8861', 'entryName',
       'labeled_pos_count', 'pos_dict', 'count_C_targets', 'count_K_targets',
       'found_count', 'found_count_C', 'found_count_K', 'missed_count',
       'missed_count_C', 'missed_count_K', 'correct_frac', 'missed_frac'],
      dtype='object')

Index(['ENSPv', 'ENSP', 'ENSTv', 'ENST', 'ENSGv', 'ENSG', 'Length',
       'proSequence', 'stableID_key', 'ID', 'pro_ver', 'tx_ver', 'gen_ver',
       'identical_2UKBseq', 'Length_UKB', 'len_ensp_minus_ukb',
       'hamming_distance', 'hamming_normalized_dist', 'levenshtein_distance',
       'levenshtein_normalized_dist', 'key_8861', 'entryName',
       'labeled_pos_count', 'pos_dict', 'count_C_targets', 'count_K_targets',
       'found_count', 'found_count_C', 'found_count_K', 'missed_count',
       'missed_count_C', 'missed_count_K', 'correct_frac', 'missed_frac',
       'UKBIDinFALSEReleases'],
      dtype='object')

ENSP-level fraction missed scores

frac missed = # positions missed / # of positions searched based on total per UKB ID

In [38]:
# adding column for release # before merging
v97a['release'] = 97
v96a['release'] = 96
v94a['release'] = 94
v92a['release'] = 92
v85a['release'] = 85

v97s['release'] = 97
v96s['release'] = 96
v94s['release'] = 94
v92s['release'] = 92
v85s['release'] = 85

v97f['release'] = 97
v96f['release'] = 96
v94f['release'] = 94
v92f['release'] = 92
v85f['release'] = 85

concat all release files with different #'s of ENSP (except shared sets) together

In [39]:
all_mer = pd.concat([v85a, v92a, v94a, v96a, v97a])

share_mer = pd.concat([v85s, v92s, v94s, v96s, v97s])

false_mer = pd.concat([v85f, v92f, v94f, v96f, v97f])
In [40]:
# adding gene name column 
all_mer['HGNC_gene'] = all_mer['ID']
all_mer.HGNC_gene = all_mer.HGNC_gene.map(ref_gene)
all_mer.head(4)
Out[40]:
ENSPv ENSP ENSTv ENST ENSGv ENSG Length proSequence stableID_key ID pro_ver tx_ver gen_ver identical_2UKBseq Length_UKB len_ensp_minus_ukb hamming_distance hamming_normalized_dist levenshtein_distance levenshtein_normalized_dist key_8861 entryName labeled_pos_count pos_dict count_C_targets count_K_targets found_count found_count_C found_count_K missed_count missed_count_C missed_count_K correct_frac missed_frac release HGNC_gene
0 ENSP00000000233.5 ENSP00000000233 ENST00000000233.5 ENST00000000233 ENSG00000004059.6 ENSG00000004059 180 MGLTVSALFSRIFGKKQMRILMVGLDAAGKTTILYKLKLGEIVTTI... ENSG00000004059_ENST00000000233_ENSP00000000233 P84085 5 5 6 True 180 0 0.0 0.000000 0.0 0.000000 True ARF5_HUMAN 4 {38: 'K', 62: 'C', 159: 'C', 179: 'K'} 2 2 4 2 2 0 0 0 1.0 0.0 85 ARF5
1 ENSP00000001008.4 ENSP00000001008 ENST00000001008.4 ENST00000001008 ENSG00000004478.5 ENSG00000004478 459 MTAEEMKATESGAQSAPLPMEGVDISPKQDEGVLKVIKREGTGTEM... ENSG00000004478_ENST00000001008_ENSP00000001008 Q02790 4 4 5 True 459 0 0.0 0.000000 0.0 0.000000 True FKBP4_HUMAN 16 {28: 'K', 103: 'C', 222: 'K', 274: 'K', 282: '... 3 13 16 3 13 0 0 0 1.0 0.0 85 FKBP4
2 ENSP00000003302.4 ENSP00000003302 ENST00000003302.4 ENST00000003302 ENSG00000048028.7 ENSG00000048028 1077 MTAELQQDDAAGAADGHGSSCQMLLNQLREITGIQDPSFLHEALKA... ENSG00000048028_ENST00000003302_ENSP00000003302 Q96RU2 4 4 7 True 1077 0 0.0 0.000000 0.0 0.000000 True UBP28_HUMAN 3 {45: 'K', 171: 'C', 733: 'C'} 2 1 3 2 1 0 0 0 1.0 0.0 85 USP28
3 ENSP00000260188.5 ENSP00000260188 ENST00000260188.5 ENST00000260188 ENSG00000048028.7 ENSG00000048028 1045 MTAELQQDDAAGAADGHGSSCQMLLNQLREITGIQDPSFLHEALKA... ENSG00000048028_ENST00000260188_ENSP00000260188 Q96RU2 5 5 7 False 1077 -32 292.0 0.271123 32.0 0.029712 False UBP28_HUMAN 3 {45: 'K', 171: 'C', 733: 'C'} 2 1 3 2 1 0 0 0 1.0 0.0 85 USP28
In [41]:
# adding gene name column 
share_mer['HGNC_gene'] = share_mer['ID']
share_mer.HGNC_gene = share_mer.HGNC_gene.map(ref_gene)
share_mer.head(4)
Out[41]:
ENSPv ENSP ENSTv ENST ENSGv ENSG Length proSequence stableID_key ID pro_ver tx_ver gen_ver identical_2UKBseq Length_UKB len_ensp_minus_ukb hamming_distance hamming_normalized_dist levenshtein_distance levenshtein_normalized_dist key_8861 entryName labeled_pos_count pos_dict count_C_targets count_K_targets found_count found_count_C found_count_K missed_count missed_count_C missed_count_K correct_frac missed_frac release HGNC_gene
0 ENSP00000000233.5 ENSP00000000233 ENST00000000233.5 ENST00000000233 ENSG00000004059.6 ENSG00000004059 180 MGLTVSALFSRIFGKKQMRILMVGLDAAGKTTILYKLKLGEIVTTI... ENSG00000004059_ENST00000000233_ENSP00000000233 P84085 5 5 6 True 180 0 0.0 0.0 0.0 0.0 True ARF5_HUMAN 4 {38: 'K', 62: 'C', 159: 'C', 179: 'K'} 2 2 4 2 2 0 0 0 1.0 0.0 85 ARF5
1 ENSP00000001008.4 ENSP00000001008 ENST00000001008.4 ENST00000001008 ENSG00000004478.5 ENSG00000004478 459 MTAEEMKATESGAQSAPLPMEGVDISPKQDEGVLKVIKREGTGTEM... ENSG00000004478_ENST00000001008_ENSP00000001008 Q02790 4 4 5 True 459 0 0.0 0.0 0.0 0.0 True FKBP4_HUMAN 16 {28: 'K', 103: 'C', 222: 'K', 274: 'K', 282: '... 3 13 16 3 13 0 0 0 1.0 0.0 85 FKBP4
2 ENSP00000003302.4 ENSP00000003302 ENST00000003302.4 ENST00000003302 ENSG00000048028.7 ENSG00000048028 1077 MTAELQQDDAAGAADGHGSSCQMLLNQLREITGIQDPSFLHEALKA... ENSG00000048028_ENST00000003302_ENSP00000003302 Q96RU2 4 4 7 True 1077 0 0.0 0.0 0.0 0.0 True UBP28_HUMAN 3 {45: 'K', 171: 'C', 733: 'C'} 2 1 3 2 1 0 0 0 1.0 0.0 85 USP28
3 ENSP00000005259.4 ENSP00000005259 ENST00000005259.4 ENST00000005259 ENSG00000075790.6 ENSG00000075790 241 MTLQWAAVATFLYAEIGLILIFCLPFIPPQRWQKIFSFNVWGKIAT... ENSG00000075790_ENST00000005259_ENSP00000005259 Q9UHQ4 4 4 6 True 241 0 0.0 0.0 0.0 0.0 True BAP29_HUMAN 1 {96: 'K'} 0 1 1 0 1 0 0 0 1.0 0.0 85 BCAP29
In [42]:
# adding gene name column 
false_mer['HGNC_gene'] = false_mer['ID']
false_mer.HGNC_gene = false_mer.HGNC_gene.map(ref_gene)
false_mer.head(4)
Out[42]:
ENSPv ENSP ENSTv ENST ENSGv ENSG Length proSequence stableID_key ID pro_ver tx_ver gen_ver identical_2UKBseq Length_UKB len_ensp_minus_ukb hamming_distance hamming_normalized_dist levenshtein_distance levenshtein_normalized_dist key_8861 entryName labeled_pos_count pos_dict count_C_targets count_K_targets found_count found_count_C found_count_K missed_count missed_count_C missed_count_K correct_frac missed_frac UKBIDinFALSEReleases release HGNC_gene
0 ENSP00000260188.5 ENSP00000260188 ENST00000260188.5 ENST00000260188 ENSG00000048028.7 ENSG00000048028 1045 MTAELQQDDAAGAADGHGSSCQMLLNQLREITGIQDPSFLHEALKA... ENSG00000048028_ENST00000260188_ENSP00000260188 Q96RU2 5 5 7 False 1077 -32 292.0 0.271123 32.0 0.029712 False UBP28_HUMAN 3 {45: 'K', 171: 'C', 733: 'C'} 2 1 3 2 1 0 0 0 1.0 0.0 True 85 USP28
1 ENSP00000368414.2 ENSP00000368414 ENST00000379119.2 ENST00000379119 ENSG00000075790.6 ENSG00000075790 348 MTLQWAAVATFLYAEIGLILIFCLPFIPPQRWQKIFSFNVWGKIAT... ENSG00000075790_ENST00000379119_ENSP00000368414 Q9UHQ4 2 2 6 False 241 107 118.0 0.339080 110.0 0.316092 True BAP29_HUMAN 1 {96: 'K'} 0 1 1 0 1 0 0 0 1.0 0.0 True 85 BCAP29
2 ENSP00000400718.2 ENSP00000400718 ENST00000445771.2 ENST00000445771 ENSG00000075790.6 ENSG00000075790 348 MTLQWAAVATFLYAEIGLILIFCLPFIPPQRWQKIFSFNVWGKIAT... ENSG00000075790_ENST00000445771_ENSP00000400718 Q9UHQ4 2 2 6 False 241 107 118.0 0.339080 110.0 0.316092 True BAP29_HUMAN 1 {96: 'K'} 0 1 1 0 1 0 0 0 1.0 0.0 True 85 BCAP29
3 ENSP00000005374.6 ENSP00000005374 ENST00000005374.6 ENST00000005374 ENSG00000006625.13 ENSG00000006625 114 MANSGCKDVTGPDEESFLYFAYGSNLLTERIHLRNPSAAFFCVARL... ENSG00000006625_ENST00000005374_ENSP00000005374 O75223 6 6 13 False 188 -74 91.0 0.484043 83.0 0.441489 True GGCT_HUMAN 2 {42: 'C', 148: 'K'} 1 1 1 1 0 1 0 1 0.5 0.5 True 85 GGCT
In [43]:
print("shape of all 3953 in releases diff row count: ", all_mer.shape)
print("shape of shared keys in all releases same row count: ", share_mer.shape)
print("shape of false 1796 in releases diff row count: ", false_mer.shape)
shape of all 3953 in releases diff row count:  (52417, 36)
shape of shared keys in all releases same row count:  (44305, 36)
shape of false 1796 in releases diff row count:  (20720, 37)








interuption of ENSP level mismap score for making figure df:

normalized levenshtein & hamming distance- LASER R with DYNAMIC SLOPE SCORES!

made column version but really needed already made merge axis=0 file for R

In [45]:
share_mer.head(3)
Out[45]:
ENSPv ENSP ENSTv ENST ENSGv ENSG Length proSequence stableID_key ID pro_ver tx_ver gen_ver identical_2UKBseq Length_UKB len_ensp_minus_ukb hamming_distance hamming_normalized_dist levenshtein_distance levenshtein_normalized_dist key_8861 entryName labeled_pos_count pos_dict count_C_targets count_K_targets found_count found_count_C found_count_K missed_count missed_count_C missed_count_K correct_frac missed_frac release HGNC_gene
0 ENSP00000000233.5 ENSP00000000233 ENST00000000233.5 ENST00000000233 ENSG00000004059.6 ENSG00000004059 180 MGLTVSALFSRIFGKKQMRILMVGLDAAGKTTILYKLKLGEIVTTI... ENSG00000004059_ENST00000000233_ENSP00000000233 P84085 5 5 6 True 180 0 0.0 0.0 0.0 0.0 True ARF5_HUMAN 4 {38: 'K', 62: 'C', 159: 'C', 179: 'K'} 2 2 4 2 2 0 0 0 1.0 0.0 85 ARF5
1 ENSP00000001008.4 ENSP00000001008 ENST00000001008.4 ENST00000001008 ENSG00000004478.5 ENSG00000004478 459 MTAEEMKATESGAQSAPLPMEGVDISPKQDEGVLKVIKREGTGTEM... ENSG00000004478_ENST00000001008_ENSP00000001008 Q02790 4 4 5 True 459 0 0.0 0.0 0.0 0.0 True FKBP4_HUMAN 16 {28: 'K', 103: 'C', 222: 'K', 274: 'K', 282: '... 3 13 16 3 13 0 0 0 1.0 0.0 85 FKBP4
2 ENSP00000003302.4 ENSP00000003302 ENST00000003302.4 ENST00000003302 ENSG00000048028.7 ENSG00000048028 1077 MTAELQQDDAAGAADGHGSSCQMLLNQLREITGIQDPSFLHEALKA... ENSG00000048028_ENST00000003302_ENSP00000003302 Q96RU2 4 4 7 True 1077 0 0.0 0.0 0.0 0.0 True UBP28_HUMAN 3 {45: 'K', 171: 'C', 733: 'C'} 2 1 3 2 1 0 0 0 1.0 0.0 85 USP28
In [58]:
# COPIED FROM ALL FUNCTIONS

# dynamic slope score for Rmerged (all releases axis0) files
def group_scores(df):
    gdf = df.groupby('stableID_key')['hamming_normalized_dist'].apply(list)
    gdf = pd.DataFrame(gdf, index=None)
    gdf.reset_index(inplace=True)
    print("group df shape: ", gdf.shape)
    print(gdf.head(1))
    return gdf


def mismaplines_dynamic(df, col):
# expanding this out to include dynamic levels - 1 2 3 
    diffline = []
    for index, row in df.iterrows():
        ukbid = row['stableID_key']
        ls = row[col]
        #python_ls = literal_eval(ls) 
        python_ls = ls
        lenLS = len(set(python_ls))
        if lenLS == 5:
            diffline.append("5")
        if lenLS == 4:
            diffline.append("4")
        if lenLS == 3:
            diffline.append("3")
        if lenLS == 2:
            diffline.append("2")
        if lenLS == 1:
            diffline.append("1")
    df.loc[:,'dynamic_slope_scores'] = diffline
    print(df.shape)
In [54]:
#leven = group_scores(share_mer)
group df shape:  (8861, 2)
                                      stableID_key  \
0  ENSG00000001497_ENST00000374804_ENSP00000363937   

                         levenshtein_normalized_dist  
0  [0.080381471, 0.080381471, 0.080381471, 0.0803...  
In [56]:
#hammin = group_scores(share_mer)
group df shape:  (8861, 2)
                                      stableID_key  \
0  ENSG00000001497_ENST00000374804_ENSP00000363937   

                             hamming_normalized_dist  
0  [0.85013624, 0.85013624, 0.85013624, 0.8501362...  
In [59]:
mismaplines_dynamic(leven, 'levenshtein_normalized_dist')
mismaplines_dynamic(hammin, 'hamming_normalized_dist')
(8861, 3)
(8861, 3)
In [66]:
leven.columns = ['stableID_key', 'levenshtein_normalized_dist', 'dynamic_slope_LEVENnorm']
In [68]:
hammin.columns = ['stableID_key', 'hamming_normalized_dist', 'dynamic_slope_HAMMINGnorm']
In [69]:
checkColumnValues(leven, 'dynamic_slope_LEVENnorm')
checkColumnValues(hammin, 'dynamic_slope_HAMMINGnorm')
  dynamic_slope_LEVENnorm  Count
0                       1   8697
1                       2    162
2                       5      2
  dynamic_slope_HAMMINGnorm  Count
0                         1   8709
1                         2    150
2                         5      2
In [78]:
leven[leven['dynamic_slope_LEVENnorm'] == '5']
Out[78]:
stableID_key levenshtein_normalized_dist dynamic_slope_LEVENnorm
5785 ENSG00000155657_ENST00000342992_ENSP00000343764 [nan, nan, nan, nan, nan] 5
5789 ENSG00000155657_ENST00000591111_ENSP00000465570 [nan, nan, nan, nan, nan] 5
In [70]:
# map leven score 
levendict = dict(zip(leven.stableID_key, leven.dynamic_slope_LEVENnorm))
share_mer['dynamic_slope_LEVENnorm'] = share_mer['stableID_key']
share_mer.dynamic_slope_LEVENnorm = share_mer.dynamic_slope_LEVENnorm.map(levendict)
In [72]:
# map hamming score 
hamdict = dict(zip(hammin.stableID_key, hammin.dynamic_slope_HAMMINGnorm))
share_mer['dynamic_slope_HAMMINGnorm'] = share_mer['stableID_key']
share_mer.dynamic_slope_HAMMINGnorm = share_mer.dynamic_slope_HAMMINGnorm.map(hamdict)
In [80]:
share_mer[share_mer['dynamic_slope_LEVENnorm'] == '5']
Out[80]:
ENSPv ENSP ENSTv ENST ENSGv ENSG Length proSequence stableID_key ID pro_ver tx_ver gen_ver identical_2UKBseq Length_UKB len_ensp_minus_ukb hamming_distance hamming_normalized_dist levenshtein_distance levenshtein_normalized_dist key_8861 entryName labeled_pos_count pos_dict count_C_targets count_K_targets found_count found_count_C found_count_K missed_count missed_count_C missed_count_K correct_frac missed_frac release HGNC_gene dynamic_slope_LEVENnorm dynamic_slope_HAMMINGnorm
5944 ENSP00000343764.6 ENSP00000343764 ENST00000342992.6 ENST00000342992 ENSG00000155657.19 ENSG00000155657 33423 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000342992_ENSP00000343764 Q8WZ42 6 6 19 False 34350 -927 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 0 0 0 1 1 0 0.0 1.0 85 TTN 5 5
5948 ENSP00000465570.1 ENSP00000465570 ENST00000591111.1 ENST00000591111 ENSG00000155657.19 ENSG00000155657 34350 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000591111_ENSP00000465570 Q8WZ42 1 1 19 True 34350 0 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 1 1 0 0 0 0 1.0 0.0 85 TTN 5 5
5997 ENSP00000343764.6 ENSP00000343764 ENST00000342992.10 ENST00000342992 ENSG00000155657.26 ENSG00000155657 33423 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000342992_ENSP00000343764 Q8WZ42 6 10 26 False 34350 -927 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 0 0 0 1 1 0 0.0 1.0 92 TTN 5 5
6001 ENSP00000465570.1 ENSP00000465570 ENST00000591111.5 ENST00000591111 ENSG00000155657.26 ENSG00000155657 34350 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000591111_ENSP00000465570 Q8WZ42 1 5 26 True 34350 0 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 1 1 0 0 0 0 1.0 0.0 92 TTN 5 5
5996 ENSP00000343764.6 ENSP00000343764 ENST00000342992.10 ENST00000342992 ENSG00000155657.26 ENSG00000155657 33423 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000342992_ENSP00000343764 Q8WZ42 6 10 26 False 34350 -927 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 0 0 0 1 1 0 0.0 1.0 94 TTN 5 5
6000 ENSP00000465570.1 ENSP00000465570 ENST00000591111.5 ENST00000591111 ENSG00000155657.26 ENSG00000155657 34350 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000591111_ENSP00000465570 Q8WZ42 1 5 26 True 34350 0 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 1 1 0 0 0 0 1.0 0.0 94 TTN 5 5
5992 ENSP00000343764.6 ENSP00000343764 ENST00000342992.10 ENST00000342992 ENSG00000155657.26 ENSG00000155657 33423 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000342992_ENSP00000343764 Q8WZ42 6 10 26 False 34350 -927 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 0 0 0 1 1 0 0.0 1.0 96 TTN 5 5
5996 ENSP00000465570.1 ENSP00000465570 ENST00000591111.5 ENST00000591111 ENSG00000155657.26 ENSG00000155657 34350 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000591111_ENSP00000465570 Q8WZ42 1 5 26 True 34350 0 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 1 1 0 0 0 0 1.0 0.0 96 TTN 5 5
5955 ENSP00000343764.6 ENSP00000343764 ENST00000342992.10 ENST00000342992 ENSG00000155657.26 ENSG00000155657 33423 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000342992_ENSP00000343764 Q8WZ42 6 10 26 False 34350 -927 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 0 0 0 1 1 0 0.0 1.0 97 TTN 5 5
5959 ENSP00000465570.1 ENSP00000465570 ENST00000591111.5 ENST00000591111 ENSG00000155657.26 ENSG00000155657 34350 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000591111_ENSP00000465570 Q8WZ42 1 5 26 True 34350 0 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 1 1 0 0 0 0 1.0 0.0 97 TTN 5 5
In [81]:
share_mer[share_mer['dynamic_slope_HAMMINGnorm'] == '5']
Out[81]:
ENSPv ENSP ENSTv ENST ENSGv ENSG Length proSequence stableID_key ID pro_ver tx_ver gen_ver identical_2UKBseq Length_UKB len_ensp_minus_ukb hamming_distance hamming_normalized_dist levenshtein_distance levenshtein_normalized_dist key_8861 entryName labeled_pos_count pos_dict count_C_targets count_K_targets found_count found_count_C found_count_K missed_count missed_count_C missed_count_K correct_frac missed_frac release HGNC_gene dynamic_slope_LEVENnorm dynamic_slope_HAMMINGnorm
5944 ENSP00000343764.6 ENSP00000343764 ENST00000342992.6 ENST00000342992 ENSG00000155657.19 ENSG00000155657 33423 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000342992_ENSP00000343764 Q8WZ42 6 6 19 False 34350 -927 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 0 0 0 1 1 0 0.0 1.0 85 TTN 5 5
5948 ENSP00000465570.1 ENSP00000465570 ENST00000591111.1 ENST00000591111 ENSG00000155657.19 ENSG00000155657 34350 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000591111_ENSP00000465570 Q8WZ42 1 1 19 True 34350 0 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 1 1 0 0 0 0 1.0 0.0 85 TTN 5 5
5997 ENSP00000343764.6 ENSP00000343764 ENST00000342992.10 ENST00000342992 ENSG00000155657.26 ENSG00000155657 33423 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000342992_ENSP00000343764 Q8WZ42 6 10 26 False 34350 -927 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 0 0 0 1 1 0 0.0 1.0 92 TTN 5 5
6001 ENSP00000465570.1 ENSP00000465570 ENST00000591111.5 ENST00000591111 ENSG00000155657.26 ENSG00000155657 34350 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000591111_ENSP00000465570 Q8WZ42 1 5 26 True 34350 0 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 1 1 0 0 0 0 1.0 0.0 92 TTN 5 5
5996 ENSP00000343764.6 ENSP00000343764 ENST00000342992.10 ENST00000342992 ENSG00000155657.26 ENSG00000155657 33423 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000342992_ENSP00000343764 Q8WZ42 6 10 26 False 34350 -927 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 0 0 0 1 1 0 0.0 1.0 94 TTN 5 5
6000 ENSP00000465570.1 ENSP00000465570 ENST00000591111.5 ENST00000591111 ENSG00000155657.26 ENSG00000155657 34350 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000591111_ENSP00000465570 Q8WZ42 1 5 26 True 34350 0 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 1 1 0 0 0 0 1.0 0.0 94 TTN 5 5
5992 ENSP00000343764.6 ENSP00000343764 ENST00000342992.10 ENST00000342992 ENSG00000155657.26 ENSG00000155657 33423 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000342992_ENSP00000343764 Q8WZ42 6 10 26 False 34350 -927 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 0 0 0 1 1 0 0.0 1.0 96 TTN 5 5
5996 ENSP00000465570.1 ENSP00000465570 ENST00000591111.5 ENST00000591111 ENSG00000155657.26 ENSG00000155657 34350 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000591111_ENSP00000465570 Q8WZ42 1 5 26 True 34350 0 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 1 1 0 0 0 0 1.0 0.0 96 TTN 5 5
5955 ENSP00000343764.6 ENSP00000343764 ENST00000342992.10 ENST00000342992 ENSG00000155657.26 ENSG00000155657 33423 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000342992_ENSP00000343764 Q8WZ42 6 10 26 False 34350 -927 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 0 0 0 1 1 0 0.0 1.0 97 TTN 5 5
5959 ENSP00000465570.1 ENSP00000465570 ENST00000591111.5 ENST00000591111 ENSG00000155657.26 ENSG00000155657 34350 MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... ENSG00000155657_ENST00000591111_ENSP00000465570 Q8WZ42 1 5 26 True 34350 0 NaN NaN NaN NaN True TITIN_HUMAN 1 {5248: 'C'} 1 0 1 1 0 0 0 0 1.0 0.0 97 TTN 5 5

TITIN GENE does not have a distance score or dynamic slope...too large and failed.

In [79]:
describeMe(share_mer)
(44305, 38)
Index(['ENSPv', 'ENSP', 'ENSTv', 'ENST', 'ENSGv', 'ENSG', 'Length',
       'proSequence', 'stableID_key', 'ID', 'pro_ver', 'tx_ver', 'gen_ver',
       'identical_2UKBseq', 'Length_UKB', 'len_ensp_minus_ukb',
       'hamming_distance', 'hamming_normalized_dist', 'levenshtein_distance',
       'levenshtein_normalized_dist', 'key_8861', 'entryName',
       'labeled_pos_count', 'pos_dict', 'count_C_targets', 'count_K_targets',
       'found_count', 'found_count_C', 'found_count_K', 'missed_count',
       'missed_count_C', 'missed_count_K', 'correct_frac', 'missed_frac',
       'release', 'HGNC_gene', 'dynamic_slope_LEVENnorm',
       'dynamic_slope_HAMMINGnorm'],
      dtype='object')
ENSPv                           0
ENSP                            0
ENSTv                           0
ENST                            0
ENSGv                           0
ENSG                            0
Length                          0
proSequence                     0
stableID_key                    0
ID                              0
pro_ver                         0
tx_ver                          0
gen_ver                         0
identical_2UKBseq               0
Length_UKB                      0
len_ensp_minus_ukb              0
hamming_distance               10
hamming_normalized_dist        10
levenshtein_distance           10
levenshtein_normalized_dist    10
key_8861                        0
entryName                       0
labeled_pos_count               0
pos_dict                        0
count_C_targets                 0
count_K_targets                 0
found_count                     0
found_count_C                   0
found_count_K                   0
missed_count                    0
missed_count_C                  0
missed_count_K                  0
correct_frac                    0
missed_frac                     0
release                         0
HGNC_gene                      10
dynamic_slope_LEVENnorm         0
dynamic_slope_HAMMINGnorm       0
dtype: int64
In [74]:
share_mer.to_csv("LASER_dynamic_slope_leven_hamming_shared3887_44305rows.csv", index=False)
In [14]:
# ANOTHER FILE WITH LEVEN and HAMMING scores but not merged, columns added!

# dfs = [v85s, v92s, v94s, v96s, v97s]
# for i in dfs: 
#     print(i.sort_values(['stableID_key'], inplace=True))
#     print(i.head(2))

# condf.columns = ['v85_levnorm', 'v92_levnorm', 'v94_levnorm', 'v96_levnorm', 'v97_levnorm']
# condf_ham.columns = ['v85_hamnorm', 'v92_hamnorm', 'v94_hamnorm', 'v96_hamnorm', 'v97_hamnorm']
# condf.head(2)


# condf.to_csv("LASER_LEVEN_NORM_shared_8861_ENSP.csv", index=False)
# condf_ham.to_csv("LASER_HAMMING_NORM_shared_8861_ENSP.csv", index=False)
None
                  ENSPv             ENSP              ENSTv             ENST  \
4107  ENSP00000363937.5  ENSP00000363937  ENST00000374804.5  ENST00000374804   
4108  ENSP00000363940.5  ENSP00000363940  ENST00000374807.5  ENST00000374807   

                   ENSGv             ENSG  Length  \
4107  ENSG00000001497.12  ENSG00000001497     675   
4108  ENSG00000001497.12  ENSG00000001497     717   

                                            proSequence  \
4107  MSWESGAGPGLGSQGMDLVWSAWYGKCVKGKGSLPLSAHGIVVAWL...   
4108  MSWESGAGPGLGSQGMDLVWSAWYGKCVKGKGSLPLSAHGIVVAWL...   

                                         stableID_key      ID  pro_ver  \
4107  ENSG00000001497_ENST00000374804_ENSP00000363937  Q9Y4W2        5   
4108  ENSG00000001497_ENST00000374807_ENSP00000363940  Q9Y4W2        5   

      tx_ver  gen_ver  identical_2UKBseq  Length_UKB  len_ensp_minus_ukb  \
4107       5       12              False         734                 -59   
4108       5       12              False         734                 -17   

      hamming_distance  hamming_normalized_dist  levenshtein_distance  \
4107             624.0                 0.850136                  59.0   
4108             356.0                 0.485014                  17.0   

      levenshtein_normalized_dist  key_8861    entryName  labeled_pos_count  \
4107                     0.080381      True  LAS1L_HUMAN                  2   
4108                     0.023161      True  LAS1L_HUMAN                  2   

                  pos_dict  count_C_targets  count_K_targets  found_count  \
4107  {140: 'C', 456: 'C'}                2                0            0   
4108  {140: 'C', 456: 'C'}                2                0            1   

      found_count_C  found_count_K  missed_count  missed_count_C  \
4107              0              0             2               2   
4108              1              0             1               1   

      missed_count_K  correct_frac  missed_frac  release  
4107               0           0.0          1.0       85  
4108               0           0.5          0.5       85  
None
                  ENSPv             ENSP              ENSTv             ENST  \
7509  ENSP00000363937.5  ENSP00000363937  ENST00000374804.9  ENST00000374804   
7510  ENSP00000363940.5  ENSP00000363940  ENST00000374807.9  ENST00000374807   

                   ENSGv             ENSG  Length  \
7509  ENSG00000001497.16  ENSG00000001497     675   
7510  ENSG00000001497.16  ENSG00000001497     717   

                                            proSequence  \
7509  MSWESGAGPGLGSQGMDLVWSAWYGKCVKGKGSLPLSAHGIVVAWL...   
7510  MSWESGAGPGLGSQGMDLVWSAWYGKCVKGKGSLPLSAHGIVVAWL...   

                                         stableID_key      ID  pro_ver  \
7509  ENSG00000001497_ENST00000374804_ENSP00000363937  Q9Y4W2        5   
7510  ENSG00000001497_ENST00000374807_ENSP00000363940  Q9Y4W2        5   

      tx_ver  gen_ver  identical_2UKBseq  Length_UKB  len_ensp_minus_ukb  \
7509       9       16              False         734                 -59   
7510       9       16              False         734                 -17   

      hamming_distance  hamming_normalized_dist  levenshtein_distance  \
7509             624.0                 0.850136                  59.0   
7510             356.0                 0.485014                  17.0   

      levenshtein_normalized_dist  key_8861    entryName  labeled_pos_count  \
7509                     0.080381      True  LAS1L_HUMAN                  2   
7510                     0.023161      True  LAS1L_HUMAN                  2   

                  pos_dict  count_C_targets  count_K_targets  found_count  \
7509  {140: 'C', 456: 'C'}                2                0            0   
7510  {140: 'C', 456: 'C'}                2                0            1   

      found_count_C  found_count_K  missed_count  missed_count_C  \
7509              0              0             2               2   
7510              1              0             1               1   

      missed_count_K  correct_frac  missed_frac  release  
7509               0           0.0          1.0       92  
7510               0           0.5          0.5       92  
None
                  ENSPv             ENSP              ENSTv             ENST  \
7506  ENSP00000363937.5  ENSP00000363937  ENST00000374804.9  ENST00000374804   
7507  ENSP00000363940.5  ENSP00000363940  ENST00000374807.9  ENST00000374807   

                   ENSGv             ENSG  Length  \
7506  ENSG00000001497.16  ENSG00000001497     675   
7507  ENSG00000001497.16  ENSG00000001497     717   

                                            proSequence  \
7506  MSWESGAGPGLGSQGMDLVWSAWYGKCVKGKGSLPLSAHGIVVAWL...   
7507  MSWESGAGPGLGSQGMDLVWSAWYGKCVKGKGSLPLSAHGIVVAWL...   

                                         stableID_key      ID  pro_ver  \
7506  ENSG00000001497_ENST00000374804_ENSP00000363937  Q9Y4W2        5   
7507  ENSG00000001497_ENST00000374807_ENSP00000363940  Q9Y4W2        5   

      tx_ver  gen_ver  identical_2UKBseq  Length_UKB  len_ensp_minus_ukb  \
7506       9       16              False         734                 -59   
7507       9       16              False         734                 -17   

      hamming_distance  hamming_normalized_dist  levenshtein_distance  \
7506             624.0                 0.850136                  59.0   
7507             356.0                 0.485014                  17.0   

      levenshtein_normalized_dist  key_8861    entryName  labeled_pos_count  \
7506                     0.080381      True  LAS1L_HUMAN                  2   
7507                     0.023161      True  LAS1L_HUMAN                  2   

                  pos_dict  count_C_targets  count_K_targets  found_count  \
7506  {140: 'C', 456: 'C'}                2                0            0   
7507  {140: 'C', 456: 'C'}                2                0            1   

      found_count_C  found_count_K  missed_count  missed_count_C  \
7506              0              0             2               2   
7507              1              0             1               1   

      missed_count_K  correct_frac  missed_frac  release  
7506               0           0.0          1.0       94  
7507               0           0.5          0.5       94  
None
                  ENSPv             ENSP              ENSTv             ENST  \
7502  ENSP00000363937.5  ENSP00000363937  ENST00000374804.9  ENST00000374804   
7503  ENSP00000363940.5  ENSP00000363940  ENST00000374807.9  ENST00000374807   

                   ENSGv             ENSG  Length  \
7502  ENSG00000001497.16  ENSG00000001497     675   
7503  ENSG00000001497.16  ENSG00000001497     717   

                                            proSequence  \
7502  MSWESGAGPGLGSQGMDLVWSAWYGKCVKGKGSLPLSAHGIVVAWL...   
7503  MSWESGAGPGLGSQGMDLVWSAWYGKCVKGKGSLPLSAHGIVVAWL...   

                                         stableID_key      ID  pro_ver  \
7502  ENSG00000001497_ENST00000374804_ENSP00000363937  Q9Y4W2        5   
7503  ENSG00000001497_ENST00000374807_ENSP00000363940  Q9Y4W2        5   

      tx_ver  gen_ver  identical_2UKBseq  Length_UKB  len_ensp_minus_ukb  \
7502       9       16              False         734                 -59   
7503       9       16              False         734                 -17   

      hamming_distance  hamming_normalized_dist  levenshtein_distance  \
7502             624.0                 0.850136                  59.0   
7503             356.0                 0.485014                  17.0   

      levenshtein_normalized_dist  key_8861    entryName  labeled_pos_count  \
7502                     0.080381      True  LAS1L_HUMAN                  2   
7503                     0.023161      True  LAS1L_HUMAN                  2   

                  pos_dict  count_C_targets  count_K_targets  found_count  \
7502  {140: 'C', 456: 'C'}                2                0            0   
7503  {140: 'C', 456: 'C'}                2                0            1   

      found_count_C  found_count_K  missed_count  missed_count_C  \
7502              0              0             2               2   
7503              1              0             1               1   

      missed_count_K  correct_frac  missed_frac  release  
7502               0           0.0          1.0       96  
7503               0           0.5          0.5       96  
None
                  ENSPv             ENSP              ENSTv             ENST  \
7467  ENSP00000363937.5  ENSP00000363937  ENST00000374804.9  ENST00000374804   
7468  ENSP00000363940.5  ENSP00000363940  ENST00000374807.9  ENST00000374807   

                   ENSGv             ENSG  Length  \
7467  ENSG00000001497.16  ENSG00000001497     675   
7468  ENSG00000001497.16  ENSG00000001497     717   

                                            proSequence  \
7467  MSWESGAGPGLGSQGMDLVWSAWYGKCVKGKGSLPLSAHGIVVAWL...   
7468  MSWESGAGPGLGSQGMDLVWSAWYGKCVKGKGSLPLSAHGIVVAWL...   

                                         stableID_key      ID  pro_ver  \
7467  ENSG00000001497_ENST00000374804_ENSP00000363937  Q9Y4W2        5   
7468  ENSG00000001497_ENST00000374807_ENSP00000363940  Q9Y4W2        5   

      tx_ver  gen_ver  identical_2UKBseq  Length_UKB  len_ensp_minus_ukb  \
7467       9       16              False         734                 -59   
7468       9       16              False         734                 -17   

      hamming_distance  hamming_normalized_dist  levenshtein_distance  \
7467             624.0                 0.850136                  59.0   
7468             356.0                 0.485014                  17.0   

      levenshtein_normalized_dist  key_8861    entryName  labeled_pos_count  \
7467                     0.080381      True  LAS1L_HUMAN                  2   
7468                     0.023161      True  LAS1L_HUMAN                  2   

                  pos_dict  count_C_targets  count_K_targets  found_count  \
7467  {140: 'C', 456: 'C'}                2                0            0   
7468  {140: 'C', 456: 'C'}                2                0            1   

      found_count_C  found_count_K  missed_count  missed_count_C  \
7467              0              0             2               2   
7468              1              0             1               1   

      missed_count_K  correct_frac  missed_frac  release  
7467               0           0.0          1.0       97  
7468               0           0.5          0.5       97  









back to merged df ...




CLARIFICATION ON DIFFERENT LEVELS OF FRAC MISSED:

  • EACH RELEASE has a frac-missed score calculated for all ENSP mapping to 3953.

    • This was used to create box plots, which have the frac missed plotted for each ENSP in that release.
    • CONCERN that different number of ENSP in each release is confounding
    • CYS and LYS specific box plot also plotted for each ENSP mapping to ukb ID with a detected C or K. The plots show n = ukb ID, but there are differences among number of ENSP in each release.
  • GROUPBY version created for each release (only 3953 rows total)

    • grouped on uniprot ID, and counted frac missed as (total number positions missed for that ID/ total pos searched).
    • in the case of A0AVT1, there are 2 ENSP linked in each release, each ENSP had score of 9/11 or 0/11, the group by fraction for each release is therefore = 9/22 or 0.409
  • RELEASE LEVEL SCORE: takes the fraction missed score for 3953 uniprot IDs in each release and averages these to create the release level average

    • i am taking the fraction missed from each release (ex. 0.409 for 85,92,94,96,97 for A0AVT1 and averaging these values...0.409*5/5 = 0.409)

[done in markdown J] another way to calculate FRAC MISSED:

only include unique ENSP sequences mapping to same 3953 UKB IDs.

  1. each release file with diff rows same UKB IDs, group by ID, set(proSequence), calculate the fraction missed as

    total number missed in all unique seqeunce mapping to 1 ukb ID / total positions searched per UKB ID * number sequences searched

  2. plot these values, each release will have 3953 scores from only unique ensp sequences mapping.

  3. release level would be created by averaging all fraction missed scores for 3953 in each release



creating Release-level (all 5 releases averaged into 1 frac missed score) file to FIND UKB ID with WORST FRACTION MISSED SCORES!

In [33]:
# make python read col as list
mer.frac_missed = mer.frac_missed.apply(literal_eval)
In [34]:
mer['avg_frac_missed_releaseLevel'] = mer['frac_missed'].apply(lambda x: mean(x))
# creating all releases avg score
In [35]:
mer.reset_index(inplace=True)
mer.columns = ['ID', 'ENSP', 'ENSPv', 'ENST', 'ENSTv', 'ENSG', 'ENSGv',
       'stableID_key', 'proSequence', 'count_ENSP', 'count_ENSPv',
       'count_ENST', 'count_ENSTv', 'count_ENSG', 'count_ENSGv',
       'count_stableID_key', 'count_proSequence', 'Length', 'pro_ver',
       'tx_ver', 'gen_ver', 'stdev_length', 'stdev_prov', 'stdev_txv',
       'stdev_genv', 'labeled_pos_count', 'frac_missed',
       'dynamic_slope_scores', 'avg_frac_missed_releaseLevel']
mer.head(3)
Out[35]:
ID ENSP ENSPv ENST ENSTv ENSG ENSGv stableID_key proSequence count_ENSP count_ENSPv count_ENST count_ENSTv count_ENSG count_ENSGv count_stableID_key count_proSequence Length pro_ver tx_ver gen_ver stdev_length stdev_prov stdev_txv stdev_genv labeled_pos_count frac_missed dynamic_slope_scores avg_frac_missed_releaseLevel
0 A0AVT1 {'ENSP00000313454', 'ENSP00000399234'} {'ENSP00000313454.4', 'ENSP00000399234.2'} {'ENST00000322244', 'ENST00000420827'} {'ENST00000420827.2', 'ENST00000322244.5', 'EN... {'ENSG00000033178'} {'ENSG00000033178.13', 'ENSG00000033178.8', 'E... {'ENSG00000033178_ENST00000420827_ENSP00000399... {'MEGSEPVAAHQGEEASCSSWGTGSTNKNLPIMSTASVEIDDALY... 2 2 2 4 1 3 2 2 [1052, 389, 1052, 389, 1052, 389, 1052, 389, 1... [4, 2, 4, 2, 4, 2, 4, 2, 4, 2] [5, 2, 9, 2, 9, 2, 10, 2, 10, 2] [8, 8, 12, 12, 12, 12, 13, 13, 13, 13] 349.431681 1.054093 3.743142 1.955050 11 [0.4090909090909091, 0.4090909090909091, 0.409... 1 0.409091
1 A0FGR8 {'ENSP00000251527'} {'ENSP00000251527.5', 'ENSP00000251527.6'} {'ENST00000251527'} {'ENST00000251527.10', 'ENST00000251527.5', 'E... {'ENSG00000117868'} {'ENSG00000117868.16', 'ENSG00000117868.11', '... {'ENSG00000117868_ENST00000251527_ENSP00000251... {'MTPPSRAEAGVRRSRVPSEGRWRGAEPPGISASTQPASAGRAAR... 1 2 1 3 1 3 1 2 [893, 893, 893, 845, 845] [5, 5, 5, 6, 6] [5, 9, 9, 10, 10] [11, 15, 15, 16, 16] 26.290683 0.547723 2.073644 2.073644 1 [1.0, 1.0, 1.0, 1.0, 1.0] 1 1.000000
2 A0JNW5 {'ENSP00000349285', 'ENSP00000444824', 'ENSP00... {'ENSP00000444824.2', 'ENSP00000349285.3', 'EN... {'ENST00000545232', 'ENST00000356828', 'ENST00... {'ENST00000545232.2', 'ENST00000279907.11', 'E... {'ENSG00000111647'} {'ENSG00000111647.13', 'ENSG00000111647.12', '... {'ENSG00000111647_ENST00000279907_ENSP00000279... {'MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEE... 3 3 3 6 1 3 3 3 [1464, 522, 1114, 1464, 522, 1464, 522, 1464, ... [7, 3, 2, 7, 3, 7, 3, 7, 3, 7, 3] [7, 3, 2, 11, 7, 11, 7, 12, 7, 12, 7] [8, 8, 8, 12, 12, 12, 12, 13, 13, 13, 13] 472.410838 2.195036 3.400535 2.148996 1 [0.6666666666666666, 0.5, 0.5, 0.5, 0.5] 2 0.533333
In [36]:
# adding gene name column 
mer['HGNC_gene'] = mer['ID']
mer.HGNC_gene = mer.HGNC_gene.map(ref_gene)
mer.head(2)
Out[36]:
ID ENSP ENSPv ENST ENSTv ENSG ENSGv stableID_key proSequence count_ENSP count_ENSPv count_ENST count_ENSTv count_ENSG count_ENSGv count_stableID_key count_proSequence Length pro_ver tx_ver gen_ver stdev_length stdev_prov stdev_txv stdev_genv labeled_pos_count frac_missed dynamic_slope_scores avg_frac_missed_releaseLevel HGNC_gene
0 A0AVT1 {'ENSP00000313454', 'ENSP00000399234'} {'ENSP00000313454.4', 'ENSP00000399234.2'} {'ENST00000322244', 'ENST00000420827'} {'ENST00000420827.2', 'ENST00000322244.5', 'EN... {'ENSG00000033178'} {'ENSG00000033178.13', 'ENSG00000033178.8', 'E... {'ENSG00000033178_ENST00000420827_ENSP00000399... {'MEGSEPVAAHQGEEASCSSWGTGSTNKNLPIMSTASVEIDDALY... 2 2 2 4 1 3 2 2 [1052, 389, 1052, 389, 1052, 389, 1052, 389, 1... [4, 2, 4, 2, 4, 2, 4, 2, 4, 2] [5, 2, 9, 2, 9, 2, 10, 2, 10, 2] [8, 8, 12, 12, 12, 12, 13, 13, 13, 13] 349.431681 1.054093 3.743142 1.955050 11 [0.4090909090909091, 0.4090909090909091, 0.409... 1 0.409091 UBA6
1 A0FGR8 {'ENSP00000251527'} {'ENSP00000251527.5', 'ENSP00000251527.6'} {'ENST00000251527'} {'ENST00000251527.10', 'ENST00000251527.5', 'E... {'ENSG00000117868'} {'ENSG00000117868.16', 'ENSG00000117868.11', '... {'ENSG00000117868_ENST00000251527_ENSP00000251... {'MTPPSRAEAGVRRSRVPSEGRWRGAEPPGISASTQPASAGRAAR... 1 2 1 3 1 3 1 2 [893, 893, 893, 845, 845] [5, 5, 5, 6, 6] [5, 9, 9, 10, 10] [11, 15, 15, 16, 16] 26.290683 0.547723 2.073644 2.073644 1 [1.0, 1.0, 1.0, 1.0, 1.0] 1 1.000000 ESYT2
In [117]:
checkColumnValues(mer, 'dynamic_slope_scores')
   dynamic_slope_scores  Count
0                     1   3051
1                     2    848
2                     3     43
3                     4     11
In [37]:
checkColumnValues(mer, 'avg_frac_missed_releaseLevel')
     avg_frac_missed_releaseLevel  Count
0                        0.000000   2157
1                        0.500000    289
2                        0.400000    140
3                        0.333333     98
4                        0.666667     90
5                        0.250000     72
6                        0.533333     53
7                        0.100000     46
8                        0.266667     44
9                        0.200000     33
10                       0.600000     31
11                       0.750000     30
12                       0.166667     29
13                       0.466667     25
14                       0.633333     23
15                       0.133333     20
16                       0.366667     19
17                       1.000000     17
18                       0.375000     15
19                       0.700000     15
20                       0.683333     13
21                       0.066667     13
22                       0.800000     11
23                       0.760000     10
24                       0.733333     10
25                       0.050000      9
26                       0.416667      9
27                       0.320000      9
28                       0.125000      9
29                       0.111111      9
..                            ...    ...
375                      0.416667      1
376                      0.356364      1
377                      0.874711      1
378                      0.862069      1
379                      0.439216      1
380                      0.660000      1
381                      0.678571      1
382                      0.181818      1
383                      0.655556      1
384                      0.631579      1
385                      0.066667      1
386                      0.539394      1
387                      0.359259      1
388                      0.293333      1
389                      0.651852      1
390                      0.772222      1
391                      0.728889      1
392                      0.273333      1
393                      0.156710      1
394                      0.341667      1
395                      0.107143      1
396                      0.513333      1
397                      0.151515      1
398                      0.058824      1
399                      0.291667      1
400                      0.402500      1
401                      0.770000      1
402                      0.370370      1
403                      0.057143      1
404                      0.527273      1

[405 rows x 2 columns]

17 uniprot IDs have a missed fraction score of 1!!! which IDs are these?

In [39]:
worst17 = mer[mer['avg_frac_missed_releaseLevel'] == 1] 
worst17.to_csv("WORST17_avg_frac_missed_allReleases_equals_1.csv", index=False)
In [44]:
# checking DMD gene
ultimate = pd.read_csv("/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/TSV_UNIPROT_xref/MULTIMAPPING_IDS/ultimate_UKB_sequence_key_3953.csv")
DMDukb = ultimate[ultimate['ID'] == 'P11532']
DMDensp = worst17[worst17['ID'] == 'P11532']
DMDensp.to_csv("DMDensp_5releases_merge.csv")
DMDukb.to_csv("DMDukb_canonical.csv")
In [48]:
mer.describe()
Out[48]:
count_ENSP count_ENSPv count_ENST count_ENSTv count_ENSG count_ENSGv count_stableID_key count_proSequence stdev_length stdev_prov stdev_txv stdev_genv labeled_pos_count dynamic_slope_scores avg_frac_missed_releaseLevel
count 3953.000000 3953.000000 3953.000000 3953.000000 3953.000000 3953.000000 3953.000000 3953.000000 3953.000000 3953.000000 3953.000000 3953.000000 3953.000000 3953.000000 3953.000000
mean 3.022009 3.064002 3.022009 5.695674 1.156084 2.906906 3.026309 2.051859 48.570988 0.914505 2.130183 2.018254 3.860106 1.244624 0.197596
std 2.927908 2.989557 2.927908 4.621145 0.903028 1.416515 2.941796 1.463833 227.805666 0.885192 0.886021 0.968626 5.027748 0.472537 0.257774
min 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000
25% 1.000000 1.000000 1.000000 3.000000 1.000000 2.000000 1.000000 1.000000 0.000000 0.000000 1.788854 1.686548 1.000000 1.000000 0.000000
50% 2.000000 2.000000 2.000000 4.000000 1.000000 3.000000 2.000000 2.000000 5.367450 0.845154 2.073644 1.955050 2.000000 1.000000 0.000000
75% 4.000000 4.000000 4.000000 7.000000 1.000000 3.000000 4.000000 3.000000 34.785054 1.581139 2.567763 2.073644 5.000000 1.000000 0.400000
max 47.000000 47.000000 47.000000 63.000000 24.000000 27.000000 47.000000 22.000000 11010.110338 4.743416 5.775908 8.779346 112.000000 4.000000 1.000000
In [49]:
all_mer.describe()
Out[49]:
Length pro_ver tx_ver gen_ver Length_UKB len_ensp_minus_ukb hamming_distance hamming_normalized_dist levenshtein_distance levenshtein_normalized_dist labeled_pos_count count_C_targets count_K_targets found_count found_count_C found_count_K missed_count missed_count_C missed_count_K correct_frac missed_frac release
count 52417.000000 52417.000000 52417.000000 52417.000000 52417.000000 52417.000000 52402.000000 52402.000000 52402.000000 52402.000000 52417.000000 52417.000000 52417.000000 52417.000000 52417.000000 52417.000000 52417.000000 52417.000000 52417.000000 52417.00000 52417.00000 52417.000000
mean 586.708205 2.711315 5.495259 12.473568 645.700326 -58.992121 213.085894 0.276304 67.538567 0.082528 3.938741 1.589942 2.348799 2.834367 1.095313 1.739054 1.104375 0.494630 0.609745 0.71844 0.28156 92.866417
std 886.522311 1.818620 2.769002 4.487798 1033.983792 421.317402 724.515693 0.379610 422.775705 0.168112 5.222083 1.994442 4.047183 4.385355 1.709544 3.370514 3.256925 1.305212 2.305362 0.42658 0.42658 4.227520
min 35.000000 1.000000 1.000000 1.000000 56.000000 -34137.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 85.000000
25% 246.000000 1.000000 4.000000 10.000000 279.000000 -30.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.25000 0.00000 92.000000
50% 427.000000 2.000000 5.000000 13.000000 464.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 1.00000 0.00000 94.000000
75% 706.000000 4.000000 7.000000 16.000000 758.000000 0.000000 274.000000 0.634409 41.000000 0.084656 5.000000 2.000000 3.000000 3.000000 1.000000 2.000000 1.000000 0.000000 0.000000 1.00000 0.75000 96.000000
max 35991.000000 11.000000 16.000000 26.000000 34350.000000 1856.000000 34325.000000 0.999272 34137.000000 0.993799 112.000000 25.000000 105.000000 112.000000 25.000000 105.000000 110.000000 22.000000 104.000000 1.00000 1.00000 97.000000
In [50]:
share_mer.describe()
Out[50]:
Length pro_ver tx_ver gen_ver Length_UKB len_ensp_minus_ukb hamming_distance hamming_normalized_dist levenshtein_distance levenshtein_normalized_dist labeled_pos_count count_C_targets count_K_targets found_count found_count_C found_count_K missed_count missed_count_C missed_count_K correct_frac missed_frac release
count 44305.000000 44305.000000 44305.000000 44305.000000 44305.000000 44305.000000 44295.000000 44295.000000 44295.000000 44295.000000 44305.000000 44305.000000 44305.000000 44305.000000 44305.000000 44305.000000 44305.000000 44305.000000 44305.000000 44305.000000 44305.000000 44305.000000
mean 580.045999 2.909988 5.847489 12.788715 634.320167 -54.274168 191.992731 0.249337 60.869534 0.073434 3.909265 1.582891 2.326374 2.934906 1.145063 1.789843 0.974360 0.437829 0.536531 0.746434 0.253566 92.800000
std 834.200035 1.829207 2.695173 4.105933 995.109689 405.361745 716.180095 0.367390 407.394863 0.160723 5.260836 1.982691 4.090135 4.485708 1.732094 3.459330 3.129499 1.239282 2.225835 0.413222 0.413222 4.261503
min 35.000000 1.000000 1.000000 1.000000 56.000000 -28746.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 85.000000
25% 247.000000 1.000000 5.000000 10.000000 276.000000 -21.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.500000 0.000000 92.000000
50% 428.000000 3.000000 6.000000 13.000000 461.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 94.000000
75% 704.000000 4.000000 8.000000 16.000000 755.000000 0.000000 227.500000 0.532243 34.000000 0.064220 5.000000 2.000000 3.000000 3.000000 1.000000 2.000000 1.000000 0.000000 0.000000 1.000000 0.500000 96.000000
max 34350.000000 11.000000 16.000000 26.000000 34350.000000 821.000000 32142.000000 0.997551 28825.000000 0.974703 112.000000 25.000000 105.000000 112.000000 25.000000 105.000000 110.000000 22.000000 104.000000 1.000000 1.000000 97.000000
In [51]:
false_mer.describe()
Out[51]:
Length pro_ver tx_ver gen_ver Length_UKB len_ensp_minus_ukb hamming_distance hamming_normalized_dist levenshtein_distance levenshtein_normalized_dist labeled_pos_count count_C_targets count_K_targets found_count found_count_C found_count_K missed_count missed_count_C missed_count_K correct_frac missed_frac release
count 20720.000000 20720.00000 20720.000000 20720.000000 20720.000000 20720.000000 20714.000000 20714.000000 20714.000000 20714.000000 20720.000000 20720.000000 20720.000000 20720.000000 20720.000000 20720.000000 20720.000000 20720.000000 20720.000000 20720.000000 20720.000000 20720.000000
mean 646.344836 2.44527 5.173793 13.761776 785.040830 -138.695994 505.936130 0.640005 159.209134 0.188264 3.752896 1.642712 2.110183 1.251931 0.514624 0.737307 2.500965 1.128089 1.372876 0.350023 0.649977 92.789189
std 1034.230752 1.80544 2.650819 4.300324 1324.641193 655.956222 1076.933413 0.316329 655.656605 0.210873 5.635830 2.093269 4.253159 2.896054 1.173241 2.140163 4.578587 1.775982 3.354752 0.425046 0.425046 4.277971
min 35.000000 1.00000 1.000000 1.000000 56.000000 -34137.000000 1.000000 0.000300 1.000000 0.000242 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 85.000000
25% 280.000000 1.00000 3.000000 11.000000 361.000000 -108.000000 151.000000 0.376454 23.000000 0.037583 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.090000 92.000000
50% 467.000000 2.00000 5.000000 14.000000 559.000000 -40.000000 339.000000 0.749322 50.000000 0.105425 2.000000 1.000000 1.000000 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 1.000000 94.000000
75% 759.000000 3.00000 7.000000 17.000000 900.000000 -7.000000 606.000000 0.934480 120.000000 0.257636 4.000000 2.000000 2.000000 1.000000 1.000000 1.000000 3.000000 1.000000 1.000000 0.910000 1.000000 96.000000
max 35991.000000 11.00000 15.000000 26.000000 34350.000000 1856.000000 34325.000000 0.999272 34137.000000 0.993799 112.000000 25.000000 105.000000 48.000000 18.000000 33.000000 110.000000 22.000000 104.000000 1.000000 1.000000 97.000000

warning: describe tables count values is less for distance metric columns, double check merge was successful!

  • all ensp spec file has delta 15 less distance scores
  • shared spec file has delta 10
  • false spec file has delta 6
In [66]:
check = false_mer['hamming_normalized_dist'].value_counts()
In [68]:
check.to_csv("check_FALSE_distance_20720.csv")

saving files

In [52]:
mer.drop(['proSequence'], axis=1, inplace=True)
In [54]:
mer.to_csv("RELEASE_LEVEL_3953_avgFRACMISSED_genename_3953rows.csv",index=False)
In [55]:
all_mer.to_csv("ENSP_LEVEL_all3953_genename_distanceMetrics_52417rows.csv",index=False)
In [56]:
share_mer.to_csv("ENSP_LEVEL_shared3887_genename_distanceMetrics_44305rows.csv",index=False)
In [57]:
false_mer.to_csv("ENSP_LEVEL_false1796_genename_distanceMetrics_20720rows.csv",index=False)



VERSION INCREMENT TOP IDS FOR PROTEIN TRANSCRIPT GENE

  1. tried using standard deviation of version IDs (not clear)
  2. using range of version numbers from 5 releases for same version ID
In [5]:
os.chdir("/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/TSV_UNIPROT_xref/MULTIMAPPING_IDS/MISMAP2.0/")

1. STD method

In [5]:
mer = pd.read_csv("RELEASE_LEVEL_3953_avgFRACMISSED_genename_3953rows.csv")
mer.head(2)
Out[5]:
ID ENSP ENSPv ENST ENSTv ENSG ENSGv stableID_key count_ENSP count_ENSPv count_ENST count_ENSTv count_ENSG count_ENSGv count_stableID_key count_proSequence Length pro_ver tx_ver gen_ver stdev_length stdev_prov stdev_txv stdev_genv labeled_pos_count frac_missed dynamic_slope_scores avg_frac_missed_releaseLevel HGNC_gene
0 A0AVT1 {'ENSP00000313454', 'ENSP00000399234'} {'ENSP00000313454.4', 'ENSP00000399234.2'} {'ENST00000322244', 'ENST00000420827'} {'ENST00000420827.2', 'ENST00000322244.5', 'EN... {'ENSG00000033178'} {'ENSG00000033178.13', 'ENSG00000033178.8', 'E... {'ENSG00000033178_ENST00000420827_ENSP00000399... 2 2 2 4 1 3 2 2 [1052, 389, 1052, 389, 1052, 389, 1052, 389, 1... [4, 2, 4, 2, 4, 2, 4, 2, 4, 2] [5, 2, 9, 2, 9, 2, 10, 2, 10, 2] [8, 8, 12, 12, 12, 12, 13, 13, 13, 13] 349.431681 1.054093 3.743142 1.955050 11 [0.4090909090909091, 0.4090909090909091, 0.409... 1 0.409091 UBA6
1 A0FGR8 {'ENSP00000251527'} {'ENSP00000251527.5', 'ENSP00000251527.6'} {'ENST00000251527'} {'ENST00000251527.10', 'ENST00000251527.5', 'E... {'ENSG00000117868'} {'ENSG00000117868.16', 'ENSG00000117868.11', '... {'ENSG00000117868_ENST00000251527_ENSP00000251... 1 2 1 3 1 3 1 2 [893, 893, 893, 845, 845] [5, 5, 5, 6, 6] [5, 9, 9, 10, 10] [11, 15, 15, 16, 16] 26.290683 0.547723 2.073644 2.073644 1 [1.0, 1.0, 1.0, 1.0, 1.0] 1 1.000000 ESYT2
In [6]:
# TOP 25 for TX PRO GENE biotypes

#mer.sort_values(['stdev_prov'], ascending=False,inplace=True)
#mer.sort_values(['stdev_txv'], ascending=False,inplace=True)
#mer.sort_values(['stdev_genv'], ascending=False,inplace=True)

# grab top 25 for each sorted biotype file
#top25std = mer[0:25]

# saving
#top25std.to_csv("STD_ENSPv_top25_3953RELEASE_level.csv", index=False)
#top25std.to_csv("STD_ENSTv_top25_3953RELEASE_level.csv", index=False)
#top25std.to_csv("STD_ENSGv_top25_3953RELEASE_level.csv", index=False)

2. range of version values for same stable ID keys, range of 5 version numbers from 5 releases

In [7]:
stable = pd.read_csv("ENSP_LEVEL_shared3887_genename_distanceMetrics_44305rows.csv")
stable.columns
Out[7]:
Index(['ENSPv', 'ENSP', 'ENSTv', 'ENST', 'ENSGv', 'ENSG', 'Length',
       'proSequence', 'stableID_key', 'ID', 'pro_ver', 'tx_ver', 'gen_ver',
       'identical_2UKBseq', 'Length_UKB', 'len_ensp_minus_ukb',
       'hamming_distance', 'hamming_normalized_dist', 'levenshtein_distance',
       'levenshtein_normalized_dist', 'key_8861', 'entryName',
       'labeled_pos_count', 'pos_dict', 'count_C_targets', 'count_K_targets',
       'found_count', 'found_count_C', 'found_count_K', 'missed_count',
       'missed_count_C', 'missed_count_K', 'correct_frac', 'missed_frac',
       'release', 'HGNC_gene'],
      dtype='object')
In [8]:
slimstable = stable[['ENSPv', 'ENSTv', 'ENSGv', 'Length',
       'stableID_key', 'ID', 'pro_ver', 'tx_ver', 'gen_ver',
       'identical_2UKBseq', 'len_ensp_minus_ukb',
       'hamming_normalized_dist', 
       'levenshtein_normalized_dist','release', 'HGNC_gene']].copy()
In [9]:
slimstable.sort_values(by=['stableID_key'], inplace=True)
In [10]:
slimstable.head(3)
Out[10]:
ENSPv ENSTv ENSGv Length stableID_key ID pro_ver tx_ver gen_ver identical_2UKBseq len_ensp_minus_ukb hamming_normalized_dist levenshtein_normalized_dist release HGNC_gene
4107 ENSP00000363937.5 ENST00000374804.5 ENSG00000001497.12 675 ENSG00000001497_ENST00000374804_ENSP00000363937 Q9Y4W2 5 5 12 False -59 0.850136 0.080381 85 LAS1L
16370 ENSP00000363937.5 ENST00000374804.9 ENSG00000001497.16 675 ENSG00000001497_ENST00000374804_ENSP00000363937 Q9Y4W2 5 9 16 False -59 0.850136 0.080381 92 LAS1L
34085 ENSP00000363937.5 ENST00000374804.9 ENSG00000001497.16 675 ENSG00000001497_ENST00000374804_ENSP00000363937 Q9Y4W2 5 9 16 False -59 0.850136 0.080381 96 LAS1L
In [20]:
def rangefinder(s):
    maxx = max(s)
    minn = min(s)
    diff = maxx - minn
    return diff

# group by stableID_key make 3 seperate columns from list of pro_ver, tx_ver, gene_ver 
gene_stable = slimstable.groupby('stableID_key')['gen_ver'].apply(list)
pro_stable = slimstable.groupby('stableID_key')['pro_ver'].apply(list)
tx_stable = slimstable.groupby('stableID_key')['tx_ver'].apply(list)

# concat
list_columns = pd.concat([gene_stable, tx_stable, pro_stable], axis=1)

# creating column for number of ENSP ids
list_columns['range_gene_ver'] = list_columns['gen_ver'].apply(rangefinder)
list_columns['range_tx_ver'] = list_columns['tx_ver'].apply(rangefinder)
list_columns['range_pro_ver'] = list_columns['pro_ver'].apply(rangefinder)
In [25]:
list_columns.reset_index(inplace=True)
list_columns.head(3)
Out[25]:
stableID_key gen_ver tx_ver pro_ver range_gene_ver range_tx_ver range_pro_ver
0 ENSG00000001497_ENST00000374804_ENSP00000363937 [12, 16, 16, 16, 16] [5, 9, 9, 9, 9] [5, 5, 5, 5, 5] 4 4 0
1 ENSG00000001497_ENST00000374807_ENSP00000363940 [16, 12, 16, 16, 16] [9, 5, 9, 9, 9] [5, 5, 5, 5, 5] 4 4 0
2 ENSG00000001497_ENST00000374811_ENSP00000363944 [16, 16, 16, 12, 16] [7, 7, 7, 3, 7] [3, 3, 3, 3, 3] 4 4 0
In [26]:
refhgnc = stable[['stableID_key','HGNC_gene']].copy()
refhgnc.drop_duplicates(inplace=True)
refhgnc.head(3)
Out[26]:
stableID_key HGNC_gene
0 ENSG00000004059_ENST00000000233_ENSP00000000233 ARF5
1 ENSG00000004478_ENST00000001008_ENSP00000001008 FKBP4
2 ENSG00000048028_ENST00000003302_ENSP00000003302 USP28
In [27]:
# mapping these columns to df with dic
ref_hgnc = dict(zip(refhgnc.stableID_key, refhgnc.HGNC_gene))
# adding gene name column 
list_columns['HGNC_gene'] = list_columns['stableID_key']
list_columns.HGNC_gene = list_columns.HGNC_gene.map(ref_hgnc)
list_columns.head(3)
Out[27]:
stableID_key gen_ver tx_ver pro_ver range_gene_ver range_tx_ver range_pro_ver HGNC_gene
0 ENSG00000001497_ENST00000374804_ENSP00000363937 [12, 16, 16, 16, 16] [5, 9, 9, 9, 9] [5, 5, 5, 5, 5] 4 4 0 LAS1L
1 ENSG00000001497_ENST00000374807_ENSP00000363940 [16, 12, 16, 16, 16] [9, 5, 9, 9, 9] [5, 5, 5, 5, 5] 4 4 0 LAS1L
2 ENSG00000001497_ENST00000374811_ENSP00000363944 [16, 16, 16, 12, 16] [7, 7, 7, 3, 7] [3, 3, 3, 3, 3] 4 4 0 LAS1L
In [29]:
list_columns.describe()
Out[29]:
range_gene_ver range_tx_ver range_pro_ver
count 8861.000000 8861.000000 8861.000000
mean 4.676560 3.718429 0.014445
std 1.094364 1.448533 0.122129
min 0.000000 0.000000 0.000000
25% 4.000000 4.000000 0.000000
50% 5.000000 4.000000 0.000000
75% 5.000000 4.000000 0.000000
max 10.000000 7.000000 2.000000
In [35]:
# TOP 25 for TX PRO GENE biotypes

#list_columns.sort_values(['range_gene_ver'], ascending=False,inplace=True)
#list_columns.sort_values(['range_tx_ver'], ascending=False,inplace=True)
list_columns.sort_values(['range_pro_ver'], ascending=False,inplace=True)

# grab top 25 for each sorted biotype file
top25 = list_columns[0:25]

# saving
#top25.to_csv("RANGE_ENSGv_top25_8861stableKEY_level.csv", index=False)
#top25.to_csv("RANGE_ENSTv_top25_8861stableKEY_level.csv", index=False)
top25.to_csv("RANGE_ENSPv_top25_8861stableKEY_level.csv", index=False)
In [36]:
top25
Out[36]:
stableID_key gen_ver tx_ver pro_ver range_gene_ver range_tx_ver range_pro_ver HGNC_gene
7882 ENSG00000196235_ENST00000432763_ENSP00000404029 [14, 14, 13, 13, 9] [7, 7, 6, 6, 2] [4, 4, 4, 4, 2] 5 5 2 SUPT5H
7932 ENSG00000196642_ENST00000371663_ENSP00000360727 [19, 11, 18, 19, 18] [10, 4, 10, 10, 10] [6, 4, 6, 6, 6] 8 6 2 RABL6
6771 ENSG00000168918_ENST00000359570_ENSP00000352575 [13, 14, 13, 9, 14] [9, 9, 9, 5, 9] [7, 7, 7, 5, 7] 5 4 2 INPP5D
6002 ENSG00000160211_ENST00000393562_ENSP00000377192 [16, 18, 11, 17, 17] [7, 9, 2, 8, 8] [2, 3, 2, 3, 3] 7 7 1 G6PD
4648 ENSG00000138326_ENST00000372360_ENSP00000361435 [20, 19, 14, 19, 19] [8, 8, 3, 8, 8] [4, 4, 3, 4, 4] 6 5 1 RPS24
8836 ENSG00000262771_ENST00000571430_ENSP00000459367 [5, 5, 5, 5, 1] [5, 5, 5, 5, 1] [2, 2, 2, 2, 1] 4 4 1 SSBP1
6851 ENSG00000169756_ENST00000338045_ENSP00000337598 [12, 16, 16, 16, 16] [3, 7, 7, 7, 7] [3, 4, 4, 4, 4] 4 4 1 LIMS1
822 ENSG00000075856_ENST00000228284_ENSP00000228284 [7, 11, 11, 12, 12] [3, 7, 7, 8, 8] [3, 3, 3, 4, 4] 5 5 1 SART3
3104 ENSG00000117868_ENST00000251527_ENSP00000251527 [15, 16, 15, 16, 11] [9, 10, 9, 10, 5] [5, 6, 5, 6, 5] 5 5 1 ESYT2
2765 ENSG00000114346_ENST00000540509_ENSP00000443160 [9, 13, 13, 13, 13] [1, 5, 5, 5, 5] [1, 2, 2, 2, 2] 4 4 1 ECT2
2583 ENSG00000112118_ENST00000229854_ENSP00000229854 [18, 19, 13, 18, 19] [11, 12, 7, 11, 12] [5, 6, 5, 5, 6] 6 5 1 MCM3
2012 ENSG00000105221_ENST00000424901_ENSP00000399532 [16, 17, 12, 17, 16] [5, 5, 1, 5, 5] [2, 2, 1, 2, 2] 5 4 1 AKT2
7705 ENSG00000185420_ENST00000490107_ENSP00000419184 [18, 14, 19, 19, 18] [5, 1, 6, 6, 5] [2, 1, 2, 2, 2] 5 5 1 SMYD3
1766 ENSG00000102309_ENST00000373669_ENSP00000362773 [8, 12, 12, 13, 14] [2, 6, 6, 6, 7] [2, 2, 2, 2, 3] 6 5 1 PIN4
7530 ENSG00000182253_ENST00000336292_ENSP00000336775 [14, 10, 14, 14, 14] [10, 6, 10, 10, 10] [7, 6, 7, 7, 7] 4 4 1 SYNM
6010 ENSG00000160679_ENST00000368690_ENSP00000357679 [13, 12, 8, 13, 12] [7, 7, 3, 7, 7] [4, 4, 3, 4, 4] 5 4 1 CHTOP
8835 ENSG00000262771_ENST00000570667_ENSP00000460028 [5, 1, 5, 5, 5] [5, 1, 5, 5, 5] [2, 1, 2, 2, 2] 4 4 1 SSBP1
1062 ENSG00000085063_ENST00000426650_ENSP00000402425 [10, 15, 15, 16, 16] [2, 6, 6, 7, 7] [2, 2, 2, 3, 3] 6 5 1 CD59
7251 ENSG00000175470_ENST00000455566_ENSP00000399970 [14, 18, 20, 20, 19] [1, 5, 6, 6, 5] [1, 2, 2, 2, 2] 6 5 1 PPP2R2D
7926 ENSG00000196588_ENST00000355630_ENSP00000347847 [10, 15, 16, 16, 15] [3, 7, 8, 8, 7] [3, 3, 4, 4, 3] 6 5 1 MRTFA
668 ENSG00000070814_ENST00000377797_ENSP00000367028 [13, 20, 19, 18, 19] [3, 7, 7, 7, 7] [3, 4, 4, 4, 4] 7 4 1 TCOF1
634 ENSG00000069424_ENST00000378092_ENSP00000367332 [15, 14, 10, 14, 14] [6, 5, 1, 5, 5] [2, 1, 1, 1, 1] 5 5 1 KCNAB2
2135 ENSG00000106070_ENST00000403097_ENSP00000385544 [19, 18, 18, 13, 19] [6, 6, 6, 1, 6] [2, 2, 2, 1, 2] 6 5 1 GRB10
1066 ENSG00000085377_ENST00000369110_ENSP00000358106 [13, 13, 14, 14, 9] [7, 7, 8, 8, 3] [3, 3, 4, 4, 3] 5 5 1 PREP
7635 ENSG00000184047_ENST00000464942_ENSP00000442360 [17, 17, 19, 11, 18] [7, 7, 7, 2, 7] [2, 2, 2, 1, 2] 8 5 1 DIABLO



exploring relationship between variables related to 3953 IDs

importing files made ^

In [7]:
os.chdir("/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/TSV_UNIPROT_xref/MULTIMAPPING_IDS/MISMAP2.0/")

mer = pd.read_csv("RELEASE_LEVEL_3953_avgFRACMISSED_genename_3953rows.csv")
specmer = pd.read_csv("ENSP_LEVEL_all3953_genename_distanceMetrics_52417rows.csv")
In [21]:
print(mer.shape)
print(specmer.shape)


# mer is 1 uniprot ID to many ENSP
# specmer is many ENSP to 1 UKB id
(3953, 29)
(52417, 36)
In [53]:
specmer_number = specmer[['hamming_normalized_dist', 'levenshtein_normalized_dist',  'missed_count','missed_frac',
       'release']]

# Length diff is ENSP - UKB # last number is more constant
specmer_number.columns = ['HammingN', 'LevenshteinN',  'Missed count','Missed frac',
       'Release']

mer_number = mer[['count_ENSP', 'count_ENSPv', 'count_ENST', 'count_ENSTv',
       'count_ENSG', 'count_ENSGv', 'count_proSequence',
       'dynamic_slope_scores', 'avg_frac_missed_releaseLevel']].copy()


mer_number.columns = ['ENSP count', 'ENSPv count', 'ENST count', 'ENSTv count',
       'ENSG count', 'ENSGv count', 'ProSequence count',
       'Mismap slope', 'Release mismap mean']
In [54]:
mer_number.head(2)
Out[54]:
ENSP count ENSPv count ENST count ENSTv count ENSG count ENSGv count ProSequence count Mismap slope Release mismap mean
0 2 2 2 4 1 3 2 1 0.409091
1 1 2 1 3 1 3 2 1 1.000000
In [55]:
specmer_number.head(2)
Out[55]:
HammingN LevenshteinN Missed count Missed frac Release
0 0.0 0.0 0 0.0 85
1 0.0 0.0 0 0.0 85
In [56]:
# change dynamic to category
specmer_number['Release'] = pd.Categorical(specmer_number.Release)
#specmer_number.dtypes
/Users/mariapalafox/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [57]:
from pandas import DataFrame
import seaborn as sn
In [58]:
corr1 = mer_number.corr()
mask = np.zeros_like(corr1)
mask[np.triu_indices_from(mask)] = True
with sn.axes_style("white"):
    f, ax = plt.subplots(figsize=(7, 5))
    ax =sn.heatmap(corr1, annot=True, annot_kws={"size": 6},vmin=0, vmax=1, mask=mask, square=True)
    plt.subplots_adjust(top=1, bottom=0.5)
    #ax.set_ylim(len(8)-0.5, -0.5)
    # fix for mpl bug that cuts off top/bottom of seaborn viz
    b, t = plt.ylim() # discover the values for bottom and top
    b += 0.5 # Add 0.5 to the bottom
    t -= 0.5 # Subtract 0.5 from the top
    plt.ylim(b, t) # update the ylim(bottom, top) values

#plt.show()
plt.savefig('correlation_matrix_3953UKBIDs_multimapping_info.pdf', dpi=300, bbox_inches = "tight")
In [60]:
corr2 = specmer_number.corr()

mask = np.zeros_like(corr2)
mask[np.triu_indices_from(mask)] = True
with sn.axes_style("white"):
    f, ax = plt.subplots(figsize=(8, 6))
    ax = sn.heatmap(corr2, annot=True, annot_kws={"size": 6},vmin=0, vmax=1, mask=mask, square=True)
    plt.subplots_adjust(top=1, bottom=0.5)
    #ax.set_ylim(len(8)-0.5, -0.5)
    # fix for mpl bug that cuts off top/bottom of seaborn viz
    b, t = plt.ylim() # discover the values for bottom and top
    b += 0.5 # Add 0.5 to the bottom
    t -= 0.5 # Subtract 0.5 from the top
    plt.ylim(b, t) # update the ylim(bottom, top) values
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=45)
#plt.show()

plt.savefig('correlation_matrix_ENSPIDs_52417distance2ukb_info.pdf', dpi=300, bbox_inches = "tight")
In [14]:
# numpy pandas figure
corr1 = mer_number.corr()
corr1.style.background_gradient(cmap='coolwarm').set_precision(2)
# https://stackoverflow.com/questions/29432629/plot-correlation-matrix-using-pandas
Out[14]:
count_ENSP count_ENSPv count_ENST count_ENSTv count_ENSG count_ENSGv count_proSequence dynamic_slope_scores avg_frac_missed_releaseLevel
count_ENSP 1 1 1 0.95 0.59 0.52 0.59 0.34 0.34
count_ENSPv 1 1 1 0.95 0.58 0.52 0.6 0.36 0.34
count_ENST 1 1 1 0.95 0.59 0.52 0.59 0.34 0.34
count_ENSTv 0.95 0.95 0.95 1 0.48 0.5 0.63 0.29 0.35
count_ENSG 0.59 0.58 0.59 0.48 1 0.85 0.0038 0.023 -0.031
count_ENSGv 0.52 0.52 0.52 0.5 0.85 1 -0.0035 0.048 -0.025
count_proSequence 0.59 0.6 0.59 0.63 0.0038 -0.0035 1 0.45 0.6
dynamic_slope_scores 0.34 0.36 0.34 0.29 0.023 0.048 0.45 1 0.43
avg_frac_missed_releaseLevel 0.34 0.34 0.34 0.35 -0.031 -0.025 0.6 0.43 1
In [157]:
proseq = px.scatter(mer, x= "count_proSequence", y="avg_frac_missed_releaseLevel", color="avg_frac_missed_releaseLevel")
proseq.show()
In [164]:
corr2 = specmer_number.corr()
corr2.style.background_gradient(cmap='coolwarm').set_precision(2)
Out[164]:
Length pro_ver tx_ver gen_ver identical_2UKBseq Length_UKB len_ensp_minus_ukb hamming_distance hamming_normalized_dist levenshtein_distance levenshtein_normalized_dist labeled_pos_count count_C_targets count_K_targets missed_count missed_count_C missed_count_K missed_frac
Length 1 0.041 0.048 0.14 -0.043 0.92 -0.14 0.57 0.013 0.22 -0.12 0.07 0.18 0.0035 0.078 0.12 0.044 0.014
pro_ver 0.041 1 0.7 0.15 0.15 0.025 0.025 -0.05 -0.17 -0.027 -0.14 0.006 0.027 -0.0056 -0.074 -0.084 -0.057 -0.15
tx_ver 0.048 0.7 1 0.43 0.11 0.03 0.027 -0.032 -0.12 -0.027 -0.12 0.02 0.031 0.01 -0.053 -0.061 -0.04 -0.11
gen_ver 0.14 0.15 0.43 1 -0.23 0.15 -0.084 0.14 0.19 0.093 0.13 0.028 0.074 -0.00043 0.092 0.12 0.064 0.18
identical_2UKBseq -0.043 0.15 0.11 -0.23 1 -0.1 0.16 -0.34 -0.84 -0.18 -0.56 0.021 -0.03 0.043 -0.39 -0.44 -0.3 -0.76
Length_UKB 0.92 0.025 0.03 0.15 -0.1 1 -0.53 0.82 0.11 0.65 0.1 0.065 0.16 0.0033 0.11 0.15 0.075 0.086
len_ensp_minus_ukb -0.14 0.025 0.027 -0.084 0.16 -0.53 1 -0.75 -0.2 -0.98 -0.4 -0.012 -0.03 -0.00079 -0.12 -0.13 -0.091 -0.18
hamming_distance 0.57 -0.05 -0.032 0.14 -0.34 0.82 -0.75 1 0.41 0.78 0.3 0.061 0.11 0.026 0.29 0.34 0.21 0.37
hamming_normalized_dist 0.013 -0.17 -0.12 0.19 -0.84 0.11 -0.2 0.41 1 0.22 0.61 0.0023 0.044 -0.019 0.48 0.53 0.38 0.91
levenshtein_distance 0.22 -0.027 -0.027 0.093 -0.18 0.65 -0.98 0.78 0.22 1 0.42 0.011 0.028 8.3e-06 0.12 0.13 0.097 0.19
levenshtein_normalized_dist -0.12 -0.14 -0.12 0.13 -0.56 0.1 -0.4 0.3 0.61 0.42 1 -0.045 -0.012 -0.052 0.23 0.26 0.18 0.54
labeled_pos_count 0.07 0.006 0.02 0.028 0.021 0.065 -0.012 0.061 0.0023 0.011 -0.045 1 0.71 0.94 0.55 0.4 0.55 -0.0021
count_C_targets 0.18 0.027 0.031 0.074 -0.03 0.16 -0.03 0.11 0.044 0.028 -0.012 0.71 1 0.43 0.43 0.53 0.31 0.044
count_K_targets 0.0035 -0.0056 0.01 -0.00043 0.043 0.0033 -0.00079 0.026 -0.019 8.3e-06 -0.052 0.94 0.43 1 0.49 0.25 0.55 -0.024
missed_count 0.078 -0.074 -0.053 0.092 -0.39 0.11 -0.12 0.29 0.48 0.12 0.23 0.55 0.43 0.49 1 0.82 0.95 0.48
missed_count_C 0.12 -0.084 -0.061 0.12 -0.44 0.15 -0.13 0.34 0.53 0.13 0.26 0.4 0.53 0.25 0.82 1 0.6 0.54
missed_count_K 0.044 -0.057 -0.04 0.064 -0.3 0.075 -0.091 0.21 0.38 0.097 0.18 0.55 0.31 0.55 0.95 0.6 1 0.37
missed_frac 0.014 -0.15 -0.11 0.18 -0.76 0.086 -0.18 0.37 0.91 0.19 0.54 -0.0021 0.044 -0.024 0.48 0.54 0.37 1
In [165]:
import plotly.graph_objects as go
corr2.write_image("specificrelease_correlation_matric.png")
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-165-ef341aab8a07> in <module>
      1 import plotly.graph_objects as go
----> 2 corr2.write_image("specificrelease_correlation_matric.png")

/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   4374             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   4375                 return self[name]
-> 4376             return object.__getattribute__(self, name)
   4377 
   4378     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'write_image'
In [61]:
specmer_dist = specmer[['len_ensp_minus_ukb', 'hamming_normalized_dist',
       'levenshtein_normalized_dist', 'missed_frac','release']].copy()
In [62]:
fig = px.scatter_matrix(specmer_dist,
                        dimensions=['len_ensp_minus_ukb', 'hamming_normalized_dist',
       'levenshtein_normalized_dist', 'missed_frac'],color="release")

fig.update_traces(diagonal_visible=False)
fig.show()
In [63]:
ham = px.scatter(specmer_number, x= "hamming_normalized_dist", y="missed_frac", color="release")
ham.show()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-63-b22599b53d0c> in <module>
----> 1 ham = px.scatter(specmer_number, x= "hamming_normalized_dist", y="missed_frac", color="release")
      2 ham.show()

~/anaconda3/lib/python3.7/site-packages/plotly/express/_chart_types.py in scatter(data_frame, x, y, color, symbol, size, hover_name, hover_data, custom_data, text, facet_row, facet_col, facet_col_wrap, error_x, error_x_minus, error_y, error_y_minus, animation_frame, animation_group, category_orders, labels, color_discrete_sequence, color_discrete_map, color_continuous_scale, range_color, color_continuous_midpoint, symbol_sequence, symbol_map, opacity, size_max, marginal_x, marginal_y, trendline, trendline_color_override, log_x, log_y, range_x, range_y, render_mode, title, template, width, height)
     53     mark in 2D space.
     54     """
---> 55     return make_figure(args=locals(), constructor=go.Scatter)
     56 
     57 

~/anaconda3/lib/python3.7/site-packages/plotly/express/_core.py in make_figure(args, constructor, trace_patch, layout_patch)
   1170 
   1171     args, trace_specs, grouped_mappings, sizeref, show_colorbar = infer_config(
-> 1172         args, constructor, trace_patch
   1173     )
   1174     grouper = [x.grouper or one_group for x in grouped_mappings] or [one_group]

~/anaconda3/lib/python3.7/site-packages/plotly/express/_core.py in infer_config(args, constructor, trace_patch)
   1026             all_attrables += [group_attr]
   1027 
-> 1028     args = build_dataframe(args, all_attrables, array_attrables)
   1029 
   1030     attrs = [k for k in attrables if k in args]

~/anaconda3/lib/python3.7/site-packages/plotly/express/_core.py in build_dataframe(args, attrables, array_attrables)
    944                             "\n To use the index, pass it in directly as `df.index`."
    945                         )
--> 946                     raise ValueError(err_msg)
    947                 if length and len(df_input[argument]) != length:
    948                     raise ValueError(

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['HammingN', 'LevenshteinN', 'Missed count', 'Missed frac', 'Release'] but received: hamming_normalized_dist
In [162]:
fig = px.scatter(specmer_number, x="hamming_normalized_dist", y="missed_frac", color="release", marginal_y="violin",
           marginal_x="box")
fig.show()

PLOTLY guide to scatter matrix : https://plot.ly/python/splom/

In [16]:
dfnum.columns = ['dynamic slope', 'avg mismap', 'mean ENSP',
       'SD ENSP', 'total pos', 'UKB C abun',
       'UKB K abun', 'UKB length']
In [62]:
dfnum.columns = ['dynamic slope', 'avg mismap', 'mean ENSP',
       'SD ENSP', 'total pos', 'UKB C abun',
       'UKB K abun', 'UKB length']

fig3 = px.parallel_coordinates(dfnum, color="avg mismap",
                    color_continuous_scale=px.colors.diverging.Tealrose, color_continuous_midpoint=0.5)
fig3.show()
In [64]:
lines1 = px.parallel_coordinates(specmer_dist, color="release",
                    color_continuous_scale=px.colors.diverging.Tealrose, color_continuous_midpoint=2)
lines1.show()
In [ ]: